From b5ba2a946b7707681c9871064a7a2d9a8f382a2d Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Tue, 9 Jul 2024 14:46:31 +0400 Subject: [PATCH 01/57] go.mod: fix dependency version Otherwise, the following fails: ~> go list -m all go: github.com/optiopay/kafka@v0.0.0-00010101000000-000000000000: invalid version: unknown revision 000000000000 Signed-off-by: Oleg Vasilev --- go.mod | 1 + 1 file changed, 1 insertion(+) diff --git a/go.mod b/go.mod index 92abe5ef7..62853eece 100644 --- a/go.mod +++ b/go.mod @@ -3,6 +3,7 @@ module github.com/neondatabase/autoscaling go 1.21 replace ( + github.com/optiopay/kafka => github.com/optiopay/kafka v0.0.0 k8s.io/api => k8s.io/api v0.26.15 k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.26.15 k8s.io/apimachinery => k8s.io/apimachinery v0.26.15 From 3b19cc5a1536d49533db49e2682ac8763267ef77 Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Tue, 9 Jul 2024 14:57:17 +0400 Subject: [PATCH 02/57] lint: bump golangci-lint to v1.59.1 and fix new warnings Signed-off-by: Oleg Vasilev --- .github/workflows/lint.yml | 2 +- neonvm/apis/neonvm/v1/virtualmachine_types.go | 2 ++ pkg/api/types.go | 2 ++ 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index ace60b6fa..96d4cdde4 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -23,7 +23,7 @@ jobs: with: # Required: the version of golangci-lint is required and # should be specified with patch version. - version: v1.58.1 + version: v1.59.1 args: --timeout 5m github-token: ${{ secrets.github_token }} # caching issues, see: https://github.com/golangci/golangci-lint-action/issues/244#issuecomment-1052190775 diff --git a/neonvm/apis/neonvm/v1/virtualmachine_types.go b/neonvm/apis/neonvm/v1/virtualmachine_types.go index d197b1f85..01c56a935 100644 --- a/neonvm/apis/neonvm/v1/virtualmachine_types.go +++ b/neonvm/apis/neonvm/v1/virtualmachine_types.go @@ -353,8 +353,10 @@ func (m MilliCPU) MarshalJSON() ([]byte, error) { func (m MilliCPU) Format(state fmt.State, verb rune) { switch { case verb == 'v' && state.Flag('#'): + //nolint:errcheck // can't do anything about the write error state.Write([]byte(fmt.Sprintf("%v", uint32(m)))) default: + //nolint:errcheck // can't do anything about the write error state.Write([]byte(fmt.Sprintf("%v", m.AsFloat64()))) } } diff --git a/pkg/api/types.go b/pkg/api/types.go index abaa37eaf..14d62451e 100644 --- a/pkg/api/types.go +++ b/pkg/api/types.go @@ -272,8 +272,10 @@ func (b Bytes) MarshalJSON() ([]byte, error) { func (b Bytes) Format(state fmt.State, verb rune) { switch { case verb == 'v' && state.Flag('#'): + //nolint:errcheck // can't do anything about the write error state.Write([]byte(fmt.Sprintf("%v", uint64(b)))) default: + //nolint:errcheck // can't do anything about the write error state.Write([]byte(b.ToResourceQuantity().String())) } } From 805292d967ca04738716f6bdd33962344995d3df Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Thu, 20 Jun 2024 17:41:17 +0400 Subject: [PATCH 03/57] Implement scaling latency metrics through logical clock Signed-off-by: Oleg Vasilev --- .golangci.yml | 1 + neonvm/apis/neonvm/v1/virtualmachine_types.go | 40 ++++++ neonvm/controllers/vm_controller.go | 4 + pkg/agent/core/action.go | 23 ++-- pkg/agent/core/dumpstate.go | 26 ++-- pkg/agent/core/logiclock/logiclock.go | 67 ++++++++++ pkg/agent/core/logiclock/logiclock_test.go | 126 ++++++++++++++++++ pkg/agent/core/state.go | 109 ++++++++++++--- pkg/agent/core/state_test.go | 106 
++++++++++----- pkg/agent/core/testhelpers/construct.go | 28 +++- pkg/agent/execbridge.go | 10 +- pkg/agent/executor/core.go | 10 +- pkg/agent/executor/exec_monitor.go | 2 + pkg/agent/executor/exec_neonvm.go | 10 +- pkg/agent/executor/exec_plugin.go | 1 + pkg/agent/prommetrics.go | 9 ++ pkg/agent/runner.go | 19 ++- pkg/api/vminfo.go | 17 ++- pkg/plugin/run.go | 7 +- 19 files changed, 525 insertions(+), 90 deletions(-) create mode 100644 pkg/agent/core/logiclock/logiclock.go create mode 100644 pkg/agent/core/logiclock/logiclock_test.go diff --git a/.golangci.yml b/.golangci.yml index 4fa039051..78ffb616d 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -79,6 +79,7 @@ linters-settings: - '^github\.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1\.VirtualMachine(Migration)?(Spec)?$' - '^github\.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1\.IPPool$' - '^github\.com/neondatabase/autoscaling/pkg/agent/core\.ActionSet$' + - '^github\.com/neondatabase/autoscaling/pkg/agent/core\.Action.*$' - '^github\.com/neondatabase/autoscaling/pkg/util/patch\.Operation$' - '^github\.com/neondatabase/autoscaling/pkg/util/watch\.HandlerFuncs$' diff --git a/neonvm/apis/neonvm/v1/virtualmachine_types.go b/neonvm/apis/neonvm/v1/virtualmachine_types.go index 01c56a935..144e7ff8b 100644 --- a/neonvm/apis/neonvm/v1/virtualmachine_types.go +++ b/neonvm/apis/neonvm/v1/virtualmachine_types.go @@ -21,6 +21,7 @@ import ( "errors" "fmt" "slices" + "time" "github.com/samber/lo" @@ -191,6 +192,10 @@ type Guest struct { // +optional Ports []Port `json:"ports,omitempty"` + // Logical clock value corresponding to the desired resources of the VM. + // +optional + DesiredLogicalTime *LogicalTime `json:"desiredLogicalTime,omitempty"` + // Additional settings for the VM. // Cannot be updated. // +optional @@ -215,6 +220,39 @@ func (g Guest) ValidateForMemoryProvider(p MemoryProvider) error { return nil } +// LogicalTime allows to track progress of changes to a VM. +type LogicalTime struct { + Value int64 `json:"value"` + UpdatedAt metav1.Time `json:"updatedAt"` +} + +func (t *LogicalTime) Rewind(now time.Time) *LogicalTime { + if t == nil { + return nil + } + return &LogicalTime{ + Value: t.Value, + UpdatedAt: metav1.NewTime(now), + } +} + +func (t *LogicalTime) RewindNow() *LogicalTime { + return t.Rewind(time.Now()) +} + +func EarliestLogicalTime(ts ...*LogicalTime) *LogicalTime { + var earliest *LogicalTime + for _, t := range ts { + if t == nil { + return nil + } + if earliest == nil || t.UpdatedAt.Before(&earliest.UpdatedAt) { + earliest = t + } + } + return earliest +} + type GuestSettings struct { // Individual lines to add to a sysctl.conf file. 
See sysctl.conf(5) for more // +optional @@ -534,6 +572,8 @@ type VirtualMachineStatus struct { MemoryProvider *MemoryProvider `json:"memoryProvider,omitempty"` // +optional SSHSecretName string `json:"sshSecretName,omitempty"` + // +optional + CurrentLogicalTime *LogicalTime `json:"currentLogicalTime,omitempty"` } type VmPhase string diff --git a/neonvm/controllers/vm_controller.go b/neonvm/controllers/vm_controller.go index fc2bb3d65..573f1957e 100644 --- a/neonvm/controllers/vm_controller.go +++ b/neonvm/controllers/vm_controller.go @@ -800,6 +800,10 @@ func (r *VMReconciler) doReconcile(ctx context.Context, vm *vmv1.VirtualMachine) // do nothing } + if vm.Status.Phase == vmv1.VmRunning { + vm.Status.CurrentLogicalTime = vm.Spec.Guest.DesiredLogicalTime.RewindNow() + } + return nil } diff --git a/pkg/agent/core/action.go b/pkg/agent/core/action.go index 04e6c03e1..294b94a0f 100644 --- a/pkg/agent/core/action.go +++ b/pkg/agent/core/action.go @@ -5,6 +5,7 @@ import ( "go.uber.org/zap/zapcore" + vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1" "github.com/neondatabase/autoscaling/pkg/api" ) @@ -21,24 +22,28 @@ type ActionWait struct { } type ActionPluginRequest struct { - LastPermit *api.Resources `json:"current"` - Target api.Resources `json:"target"` - Metrics *api.Metrics `json:"metrics"` + LastPermit *api.Resources `json:"current"` + Target api.Resources `json:"target"` + Metrics *api.Metrics `json:"metrics"` + DesiredLogicalTime *vmv1.LogicalTime `json:"desiredLogicalTime"` } type ActionNeonVMRequest struct { - Current api.Resources `json:"current"` - Target api.Resources `json:"target"` + Current api.Resources `json:"current"` + Target api.Resources `json:"target"` + DesiredLogicalTime *vmv1.LogicalTime `json:"desiredLogicalTime"` } type ActionMonitorDownscale struct { - Current api.Resources `json:"current"` - Target api.Resources `json:"target"` + Current api.Resources `json:"current"` + Target api.Resources `json:"target"` + DesiredLogicalTime *vmv1.LogicalTime `json:"desiredLogicalTime"` } type ActionMonitorUpscale struct { - Current api.Resources `json:"current"` - Target api.Resources `json:"target"` + Current api.Resources `json:"current"` + Target api.Resources `json:"target"` + DesiredLogicalTime *vmv1.LogicalTime `json:"desiredLogicalTime"` } func addObjectPtr[T zapcore.ObjectMarshaler](enc zapcore.ObjectEncoder, key string, value *T) error { diff --git a/pkg/agent/core/dumpstate.go b/pkg/agent/core/dumpstate.go index 63a8d0ce3..ca862c4da 100644 --- a/pkg/agent/core/dumpstate.go +++ b/pkg/agent/core/dumpstate.go @@ -6,6 +6,7 @@ import ( "encoding/json" "time" + vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1" "github.com/neondatabase/autoscaling/pkg/api" ) @@ -33,23 +34,25 @@ func (d StateDump) MarshalJSON() ([]byte, error) { func (s *State) Dump() StateDump { return StateDump{ internal: state{ - Debug: s.internal.Debug, - Config: s.internal.Config, - VM: s.internal.VM, - Plugin: s.internal.Plugin.deepCopy(), - Monitor: s.internal.Monitor.deepCopy(), - NeonVM: s.internal.NeonVM.deepCopy(), - Metrics: shallowCopy[SystemMetrics](s.internal.Metrics), + Debug: s.internal.Debug, + Config: s.internal.Config, + VM: s.internal.VM, + Plugin: s.internal.Plugin.deepCopy(), + Monitor: s.internal.Monitor.deepCopy(), + NeonVM: s.internal.NeonVM.deepCopy(), + Metrics: shallowCopy[SystemMetrics](s.internal.Metrics), + ClockSource: s.internal.ClockSource, }, } } func (s *pluginState) deepCopy() pluginState { return pluginState{ - OngoingRequest: 
s.OngoingRequest, - LastRequest: shallowCopy[pluginRequested](s.LastRequest), - LastFailureAt: shallowCopy[time.Time](s.LastFailureAt), - Permit: shallowCopy[api.Resources](s.Permit), + OngoingRequest: s.OngoingRequest, + LastRequest: shallowCopy[pluginRequested](s.LastRequest), + LastFailureAt: shallowCopy[time.Time](s.LastFailureAt), + Permit: shallowCopy[api.Resources](s.Permit), + CurrentLogicalTime: shallowCopy[vmv1.LogicalTime](s.CurrentLogicalTime), } } @@ -61,6 +64,7 @@ func (s *monitorState) deepCopy() monitorState { Approved: shallowCopy[api.Resources](s.Approved), DownscaleFailureAt: shallowCopy[time.Time](s.DownscaleFailureAt), UpscaleFailureAt: shallowCopy[time.Time](s.UpscaleFailureAt), + CurrentLogicalTime: shallowCopy[vmv1.LogicalTime](s.CurrentLogicalTime), } } diff --git a/pkg/agent/core/logiclock/logiclock.go b/pkg/agent/core/logiclock/logiclock.go new file mode 100644 index 000000000..24d34c25e --- /dev/null +++ b/pkg/agent/core/logiclock/logiclock.go @@ -0,0 +1,67 @@ +package logiclock + +import ( + "errors" + "time" + + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1" +) + +type Clock struct { + cb func(time.Duration) + times []time.Time + offset int64 +} + +func NewClock(cb func(time.Duration)) *Clock { + return &Clock{ + cb: cb, + times: nil, + offset: 0, + } +} + +func (c *Clock) NextValue() int64 { + return c.offset + int64(len(c.times)) +} + +func (c *Clock) Next(now time.Time) *vmv1.LogicalTime { + ret := vmv1.LogicalTime{ + Value: c.NextValue(), + UpdatedAt: v1.NewTime(now), + } + c.times = append(c.times, ret.UpdatedAt.Time) + return &ret +} + +func (c *Clock) Observe(logicalTime *vmv1.LogicalTime) error { + if logicalTime == nil { + return nil + } + if logicalTime.Value < c.offset { + return nil + } + + idx := logicalTime.Value - c.offset + if idx > int64(len(c.times)) { + return errors.New("logicalTime value is in the future") + } + + diff := logicalTime.UpdatedAt.Time.Sub(c.times[idx]) + + if c.cb != nil { + c.cb(diff) + } + + c.offset = logicalTime.Value + 1 + c.times = c.times[idx+1:] + + return nil +} + +type NilClock struct{} + +func (c *NilClock) Next(now time.Time) *vmv1.LogicalTime { return nil } +func (c *NilClock) Observe(_ *vmv1.LogicalTime) error { return nil } diff --git a/pkg/agent/core/logiclock/logiclock_test.go b/pkg/agent/core/logiclock/logiclock_test.go new file mode 100644 index 000000000..e8ec9ca80 --- /dev/null +++ b/pkg/agent/core/logiclock/logiclock_test.go @@ -0,0 +1,126 @@ +package logiclock_test + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1" + "github.com/neondatabase/autoscaling/pkg/agent/core/logiclock" +) + +type testClockMetric struct { + *logiclock.Clock + t *testing.T + now v1.Time + result *time.Duration +} + +func (tcm *testClockMetric) advance(d time.Duration) { + tcm.now = v1.NewTime(tcm.now.Add(d)) +} + +func (tcm *testClockMetric) assertResult(d time.Duration) { + require.NotNil(tcm.t, tcm.result) + assert.Equal(tcm.t, d, *tcm.result) + tcm.result = nil +} + +func (tcm *testClockMetric) nextNow() *vmv1.LogicalTime { + return tcm.Next(tcm.now.Time) +} + +func newTestClockMetric(t *testing.T) *testClockMetric { + tcm := &testClockMetric{ + Clock: nil, + t: t, + now: v1.NewTime(time.Now()), + result: nil, + } + + cb := func(d time.Duration) { + tcm.result = &d + } + tcm.Clock = 
logiclock.NewClock(cb) + + return tcm +} + +func TestClockMetric(t *testing.T) { + tcm := newTestClockMetric(t) + + // Generate new clock + cl := tcm.nextNow() + assert.Equal(t, int64(0), cl.Value) + + // Observe it coming back in 5 seconds + tcm.advance(5 * time.Second) + err := tcm.Observe(&vmv1.LogicalTime{ + Value: 0, + UpdatedAt: tcm.now, + }) + assert.NoError(t, err) + tcm.assertResult(5 * time.Second) +} + +func TestClockMetricSkip(t *testing.T) { + tcm := newTestClockMetric(t) + + // Generate new clock + cl := tcm.nextNow() + assert.Equal(t, int64(0), cl.Value) + + // Generate another one + tcm.advance(5 * time.Second) + cl = tcm.nextNow() + assert.Equal(t, int64(1), cl.Value) + + // Observe the first one + tcm.advance(5 * time.Second) + err := tcm.Observe(&vmv1.LogicalTime{ + Value: 0, + UpdatedAt: tcm.now, + }) + assert.NoError(t, err) + tcm.assertResult(10 * time.Second) + + // Observe the second one + tcm.advance(2 * time.Second) + err = tcm.Observe(&vmv1.LogicalTime{ + Value: 1, + UpdatedAt: tcm.now, + }) + assert.NoError(t, err) + tcm.assertResult(7 * time.Second) +} + +func TestClockMetricStale(t *testing.T) { + tcm := newTestClockMetric(t) + + // Generate new clock + cl := tcm.nextNow() + assert.Equal(t, int64(0), cl.Value) + + // Observe it coming back in 5 seconds + tcm.advance(5 * time.Second) + err := tcm.Observe(&vmv1.LogicalTime{ + Value: 0, + UpdatedAt: tcm.now, + }) + assert.NoError(t, err) + tcm.assertResult(5 * time.Second) + + // Observe it coming back again + tcm.advance(5 * time.Second) + err = tcm.Observe(&vmv1.LogicalTime{ + Value: 0, + UpdatedAt: tcm.now, + }) + // No error, but no result either + assert.NoError(t, err) + assert.Nil(t, tcm.result) +} diff --git a/pkg/agent/core/state.go b/pkg/agent/core/state.go index 5a3d27d73..f6fa38be5 100644 --- a/pkg/agent/core/state.go +++ b/pkg/agent/core/state.go @@ -30,6 +30,7 @@ import ( "github.com/samber/lo" "go.uber.org/zap" + vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1" "github.com/neondatabase/autoscaling/pkg/api" "github.com/neondatabase/autoscaling/pkg/util" ) @@ -114,6 +115,8 @@ type state struct { NeonVM neonvmState Metrics *SystemMetrics + + ClockSource LogicClock `json:"-"` } type pluginState struct { @@ -127,6 +130,8 @@ type pluginState struct { // Permit, if not nil, stores the Permit in the most recent PluginResponse. This field will be // nil if we have not been able to contact *any* scheduler. 
Permit *api.Resources + + CurrentLogicalTime *vmv1.LogicalTime } type pluginRequested struct { @@ -155,6 +160,8 @@ type monitorState struct { // UpscaleFailureAt, if not nil, stores the time at which an upscale request most recently // failed UpscaleFailureAt *time.Time + + CurrentLogicalTime *vmv1.LogicalTime } func (ms *monitorState) active() bool { @@ -196,17 +203,23 @@ func (ns *neonvmState) ongoingRequest() bool { return ns.OngoingRequested != nil } -func NewState(vm api.VmInfo, config Config) *State { - return &State{ +type LogicClock interface { + Next(ts time.Time) *vmv1.LogicalTime + Observe(logicalTime *vmv1.LogicalTime) error +} + +func NewState(vm api.VmInfo, config Config, clockSource LogicClock) *State { + state := &State{ internal: state{ Config: config, Debug: false, VM: vm, Plugin: pluginState{ - OngoingRequest: false, - LastRequest: nil, - LastFailureAt: nil, - Permit: nil, + OngoingRequest: false, + LastRequest: nil, + LastFailureAt: nil, + Permit: nil, + CurrentLogicalTime: nil, }, Monitor: monitorState{ OngoingRequest: nil, @@ -215,15 +228,19 @@ func NewState(vm api.VmInfo, config Config) *State { Approved: nil, DownscaleFailureAt: nil, UpscaleFailureAt: nil, + CurrentLogicalTime: nil, }, NeonVM: neonvmState{ LastSuccess: nil, OngoingRequested: nil, RequestFailedAt: nil, }, - Metrics: nil, + Metrics: nil, + ClockSource: clockSource, }, } + + return state } func (s *state) info(msg string, fields ...zap.Field) { @@ -256,11 +273,12 @@ func (s *state) nextActions(now time.Time) ActionSet { // our handling later on is easier if we can assume it's non-nil calcDesiredResourcesWait = func(ActionSet) *time.Duration { return nil } } + desiredLogicalTime := s.ClockSource.Next(now) // ---- // Requests to the scheduler plugin: var pluginRequiredWait *time.Duration - actions.PluginRequest, pluginRequiredWait = s.calculatePluginAction(now, desiredResources) + actions.PluginRequest, pluginRequiredWait = s.calculatePluginAction(now, desiredResources, desiredLogicalTime) // ---- // Requests to NeonVM: @@ -274,7 +292,7 @@ func (s *state) nextActions(now time.Time) ActionSet { pluginRequestedPhase = "planned" } var neonvmRequiredWait *time.Duration - actions.NeonVMRequest, neonvmRequiredWait = s.calculateNeonVMAction(now, desiredResources, pluginRequested, pluginRequestedPhase) + actions.NeonVMRequest, neonvmRequiredWait = s.calculateNeonVMAction(now, desiredResources, pluginRequested, pluginRequestedPhase, desiredLogicalTime) // ---- // Requests to vm-monitor (upscaling) @@ -283,13 +301,19 @@ func (s *state) nextActions(now time.Time) ActionSet { // forego notifying the vm-monitor of increased resources because we were busy asking if it // could downscale. var monitorUpscaleRequiredWait *time.Duration - actions.MonitorUpscale, monitorUpscaleRequiredWait = s.calculateMonitorUpscaleAction(now, desiredResources) + actions.MonitorUpscale, monitorUpscaleRequiredWait = s.calculateMonitorUpscaleAction(now, desiredResources, desiredLogicalTime) // ---- // Requests to vm-monitor (downscaling) plannedUpscale := actions.MonitorUpscale != nil var monitorDownscaleRequiredWait *time.Duration - actions.MonitorDownscale, monitorDownscaleRequiredWait = s.calculateMonitorDownscaleAction(now, desiredResources, plannedUpscale) + actions.MonitorDownscale, monitorDownscaleRequiredWait = + s.calculateMonitorDownscaleAction( + now, + desiredResources, + plannedUpscale, + desiredLogicalTime, + ) // --- and that's all the request types! 
--- @@ -322,6 +346,7 @@ func (s *state) nextActions(now time.Time) ActionSet { func (s *state) calculatePluginAction( now time.Time, desiredResources api.Resources, + desiredLogicalTime *vmv1.LogicalTime, ) (*ActionPluginRequest, *time.Duration) { logFailureReason := func(reason string) { s.warnf("Wanted to make a request to the scheduler plugin, but %s", reason) @@ -409,6 +434,7 @@ func (s *state) calculatePluginAction( return nil } }(), + DesiredLogicalTime: desiredLogicalTime, }, nil } else { if wantToRequestNewResources && waitingOnRetryBackoff { @@ -429,7 +455,23 @@ func (s *state) calculateNeonVMAction( desiredResources api.Resources, pluginRequested *api.Resources, pluginRequestedPhase string, + logicalTime *vmv1.LogicalTime, ) (*ActionNeonVMRequest, *time.Duration) { + + desiredTimeCandidates := []*vmv1.LogicalTime{logicalTime} + + if desiredResources.HasFieldLessThan(s.VM.Using()) { + // We are downscaling, so we needed a permit from monitor + desiredTimeCandidates = append(desiredTimeCandidates, s.Monitor.CurrentLogicalTime) + } + + if desiredResources.HasFieldGreaterThan(s.VM.Using()) { + // We are upscaling, so we needed a permit from the plugin + desiredTimeCandidates = append(desiredTimeCandidates, s.Plugin.CurrentLogicalTime) + } + + desiredTime := vmv1.EarliestLogicalTime(desiredTimeCandidates...) + // clamp desiredResources to what we're allowed to make a request for desiredResources = s.clampResources( s.VM.Using(), // current: what we're using already @@ -457,8 +499,9 @@ func (s *state) calculateNeonVMAction( } return &ActionNeonVMRequest{ - Current: s.VM.Using(), - Target: desiredResources, + Current: s.VM.Using(), + Target: desiredResources, + DesiredLogicalTime: desiredTime, }, nil } else { var reqs []string @@ -480,6 +523,7 @@ func (s *state) calculateNeonVMAction( func (s *state) calculateMonitorUpscaleAction( now time.Time, desiredResources api.Resources, + desiredLogicalTime *vmv1.LogicalTime, ) (*ActionMonitorUpscale, *time.Duration) { // can't do anything if we don't have an active connection to the vm-monitor if !s.Monitor.active() { @@ -541,8 +585,9 @@ func (s *state) calculateMonitorUpscaleAction( // Otherwise, we can make the request: return &ActionMonitorUpscale{ - Current: *s.Monitor.Approved, - Target: requestResources, + Current: *s.Monitor.Approved, + Target: requestResources, + DesiredLogicalTime: desiredLogicalTime, }, nil } @@ -550,6 +595,7 @@ func (s *state) calculateMonitorDownscaleAction( now time.Time, desiredResources api.Resources, plannedUpscaleRequest bool, + desiredLogicalTime *vmv1.LogicalTime, ) (*ActionMonitorDownscale, *time.Duration) { // can't do anything if we don't have an active connection to the vm-monitor if !s.Monitor.active() { @@ -627,8 +673,9 @@ func (s *state) calculateMonitorDownscaleAction( // Nothing else to check, we're good to make the request return &ActionMonitorDownscale{ - Current: *s.Monitor.Approved, - Target: requestResources, + Current: *s.Monitor.Approved, + Target: requestResources, + DesiredLogicalTime: desiredLogicalTime, }, nil } @@ -792,6 +839,8 @@ func (s *state) desiredResourcesFromMetricsOrRequestedUpscaling(now time.Time) ( } } + s.updateCurrentClock(s.VM.CurrentLogicalTime) + s.info("Calculated desired resources", zap.Object("current", s.VM.Using()), zap.Object("target", result)) return result, calculateWaitTime @@ -911,6 +960,13 @@ func (s *state) pluginApprovedUpperBound() api.Resources { } } +func (s *state) updateCurrentClock(logicalTime *vmv1.LogicalTime) { + err := 
s.ClockSource.Observe(logicalTime) + if err != nil { + s.warnf("Failed to observe clock source: %v", err) + } +} + ////////////////////////////////////////// // PUBLIC FUNCTIONS TO UPDATE THE STATE // ////////////////////////////////////////// @@ -964,8 +1020,12 @@ func (h PluginHandle) RequestFailed(now time.Time) { h.s.Plugin.LastFailureAt = &now } -func (h PluginHandle) RequestSuccessful(now time.Time, resp api.PluginResponse) (_err error) { +func (h PluginHandle) RequestSuccessful( + now time.Time, + resp api.PluginResponse, +) (_err error) { h.s.Plugin.OngoingRequest = false + defer func() { if _err != nil { h.s.Plugin.LastFailureAt = &now @@ -998,6 +1058,10 @@ func (h PluginHandle) RequestSuccessful(now time.Time, resp api.PluginResponse) return nil } +func (h PluginHandle) UpdateLogicalTime(currentTime *vmv1.LogicalTime) { + h.s.Plugin.CurrentLogicalTime = currentTime +} + // MonitorHandle provides write access to the vm-monitor pieces of an UpdateState type MonitorHandle struct { s *state @@ -1015,6 +1079,7 @@ func (h MonitorHandle) Reset() { Approved: nil, DownscaleFailureAt: nil, UpscaleFailureAt: nil, + CurrentLogicalTime: nil, } } @@ -1066,6 +1131,10 @@ func (h MonitorHandle) DownscaleRequestAllowed(now time.Time) { h.s.Monitor.OngoingRequest = nil } +func (h MonitorHandle) UpdateLogicalTime(currentTime *vmv1.LogicalTime) { + h.s.Monitor.CurrentLogicalTime = currentTime +} + // Downscale request was successful but the monitor denied our request. func (h MonitorHandle) DownscaleRequestDenied(now time.Time) { h.s.Monitor.DeniedDownscale = &deniedDownscale{ @@ -1089,6 +1158,10 @@ func (s *State) NeonVM() NeonVMHandle { return NeonVMHandle{&s.internal} } +func (s *State) UpdateCurrentClock(logicalTime *vmv1.LogicalTime) { + s.internal.updateCurrentClock(logicalTime) +} + func (h NeonVMHandle) StartingRequest(now time.Time, resources api.Resources) { // FIXME: add time to ongoing request info (or maybe only in RequestFailed?) 
h.s.NeonVM.OngoingRequested = &resources diff --git a/pkg/agent/core/state_test.go b/pkg/agent/core/state_test.go index 7684b1add..6ad3a68fa 100644 --- a/pkg/agent/core/state_test.go +++ b/pkg/agent/core/state_test.go @@ -9,7 +9,11 @@ import ( "go.uber.org/zap" "golang.org/x/exp/slices" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1" "github.com/neondatabase/autoscaling/pkg/agent/core" + "github.com/neondatabase/autoscaling/pkg/agent/core/logiclock" helpers "github.com/neondatabase/autoscaling/pkg/agent/core/testhelpers" "github.com/neondatabase/autoscaling/pkg/api" ) @@ -82,6 +86,8 @@ func Test_DesiredResourcesFromMetricsOrRequestedUpscaling(t *testing.T) { for _, c := range cases { warnings := []string{} + source := logiclock.NewClock(nil) + state := core.NewState( api.VmInfo{ Name: "test", @@ -104,6 +110,7 @@ func Test_DesiredResourcesFromMetricsOrRequestedUpscaling(t *testing.T) { ScalingEnabled: true, ScalingConfig: nil, }, + CurrentLogicalTime: nil, }, core.Config{ ComputeUnit: api.Resources{VCPU: 250, Mem: 1 * slotSize}, @@ -127,6 +134,7 @@ func Test_DesiredResourcesFromMetricsOrRequestedUpscaling(t *testing.T) { }, }, }, + source, ) t.Run(c.name, func(t *testing.T) { @@ -202,19 +210,24 @@ func getDesiredResources(state *core.State, now time.Time) api.Resources { return res } -func doInitialPluginRequest( - a helpers.Assert, - state *core.State, - clock *helpers.FakeClock, - requestTime time.Duration, - metrics *api.Metrics, - resources api.Resources, -) { +func logicalTime(clock *helpers.FakeClock, value int64) *vmv1.LogicalTime { + return &vmv1.LogicalTime{ + Value: value, + UpdatedAt: v1.NewTime(clock.Now()), + } +} + +func doInitialPluginRequest(a helpers.Assert, state *core.State, clock *helpers.FakeClock, requestTime time.Duration, metrics *api.Metrics, resources api.Resources, enableLogicalClock bool) { + var lt *vmv1.LogicalTime + if enableLogicalClock { + lt = logicalTime(clock, 0) + } a.Call(state.NextActions, clock.Now()).Equals(core.ActionSet{ PluginRequest: &core.ActionPluginRequest{ - LastPermit: nil, - Target: resources, - Metrics: metrics, + LastPermit: nil, + Target: resources, + Metrics: metrics, + DesiredLogicalTime: lt, }, }) a.Do(state.Plugin().StartingRequest, clock.Now(), resources) @@ -243,10 +256,13 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { } resForCU := DefaultComputeUnit.Mul + logicClock := logiclock.NewClock(nil) + state := helpers.CreateInitialState( DefaultInitialStateConfig, helpers.WithStoredWarnings(a.StoredWarnings()), helpers.WithTestingLogfWarnings(t), + helpers.WithClock(logicClock), ) nextActions := func() core.ActionSet { return state.NextActions(clock.Now()) @@ -255,7 +271,7 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { state.Monitor().Active(true) // Send initial scheduler request: - doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(1)) + doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(1), true) // Set metrics clockTick().AssertEquals(duration("0.2s")) @@ -268,13 +284,16 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { a.Call(getDesiredResources, state, clock.Now()). Equals(resForCU(2)) + lt := logicalTime(clock, 1) + // Now that the initial scheduler request is done, and we have metrics that indicate // scale-up would be a good idea, we should be contacting the scheduler to get approval. 
a.Call(nextActions).Equals(core.ActionSet{ PluginRequest: &core.ActionPluginRequest{ - LastPermit: lo.ToPtr(resForCU(1)), - Target: resForCU(2), - Metrics: lo.ToPtr(lastMetrics.ToAPI()), + LastPermit: lo.ToPtr(resForCU(1)), + Target: resForCU(2), + Metrics: lo.ToPtr(lastMetrics.ToAPI()), + DesiredLogicalTime: lt, }, }) // start the request: @@ -286,6 +305,7 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { Permit: resForCU(2), Migrate: nil, }) + state.Plugin().UpdateLogicalTime(lt) // Scheduler approval is done, now we should be making the request to NeonVM a.Call(nextActions).Equals(core.ActionSet{ @@ -294,8 +314,9 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { // the next scheduler request. Wait: &core.ActionWait{Duration: duration("4.9s")}, NeonVMRequest: &core.ActionNeonVMRequest{ - Current: resForCU(1), - Target: resForCU(2), + Current: resForCU(1), + Target: resForCU(2), + DesiredLogicalTime: lt, }, }) // start the request: @@ -306,13 +327,17 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { Wait: &core.ActionWait{Duration: duration("4.8s")}, }) a.Do(state.NeonVM().RequestSuccessful, clock.Now()) + state.UpdateCurrentClock(lt) + + lt = logicalTime(clock, 5) // NeonVM change is done, now we should finish by notifying the vm-monitor a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: duration("4.8s")}, // same as previous, clock hasn't changed MonitorUpscale: &core.ActionMonitorUpscale{ - Current: resForCU(1), - Target: resForCU(2), + Current: resForCU(1), + Target: resForCU(2), + DesiredLogicalTime: lt, }, }) // start the request: @@ -323,6 +348,7 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { Wait: &core.ActionWait{Duration: duration("4.7s")}, }) a.Do(state.Monitor().UpscaleRequestSuccessful, clock.Now()) + state.Monitor().UpdateLogicalTime(lt) // And now, double-check that there's no sneaky follow-up actions before we change the // metrics @@ -344,12 +370,15 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { a.Call(getDesiredResources, state, clock.Now()). 
Equals(resForCU(1)) + lt = logicalTime(clock, 8) + // First step in downscaling is getting approval from the vm-monitor: a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: duration("4.6s")}, MonitorDownscale: &core.ActionMonitorDownscale{ - Current: resForCU(2), - Target: resForCU(1), + Current: resForCU(2), + Target: resForCU(1), + DesiredLogicalTime: lt, }, }) a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(1)) @@ -359,13 +388,15 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { Wait: &core.ActionWait{Duration: duration("4.5s")}, }) a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now()) + state.Monitor().UpdateLogicalTime(lt) // After getting approval from the vm-monitor, we make the request to NeonVM to carry it out a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: duration("4.5s")}, // same as previous, clock hasn't changed NeonVMRequest: &core.ActionNeonVMRequest{ - Current: resForCU(2), - Target: resForCU(1), + Current: resForCU(2), + Target: resForCU(1), + DesiredLogicalTime: lt, }, }) a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(1)) @@ -376,12 +407,15 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { }) a.Do(state.NeonVM().RequestSuccessful, clock.Now()) + lt = logicalTime(clock, 12) + // Request to NeonVM completed, it's time to inform the scheduler plugin: a.Call(nextActions).Equals(core.ActionSet{ PluginRequest: &core.ActionPluginRequest{ - LastPermit: lo.ToPtr(resForCU(2)), - Target: resForCU(1), - Metrics: lo.ToPtr(lastMetrics.ToAPI()), + LastPermit: lo.ToPtr(resForCU(2)), + Target: resForCU(1), + Metrics: lo.ToPtr(lastMetrics.ToAPI()), + DesiredLogicalTime: lt, }, // shouldn't have anything to say to the other components }) @@ -393,6 +427,7 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { Permit: resForCU(1), Migrate: nil, }) + state.Plugin().UpdateLogicalTime(lt) // Finally, check there's no leftover actions: a.Call(nextActions).Equals(core.ActionSet{ @@ -428,7 +463,7 @@ func TestPeriodicPluginRequest(t *testing.T) { reqEvery := DefaultInitialStateConfig.Core.PluginRequestTick endTime := duration("20s") - doInitialPluginRequest(a, state, clock, clockTick, lo.ToPtr(metrics.ToAPI()), resources) + doInitialPluginRequest(a, state, clock, clockTick, lo.ToPtr(metrics.ToAPI()), resources, false) for clock.Elapsed().Duration < endTime { timeSinceScheduledRequest := (clock.Elapsed().Duration - base) % reqEvery @@ -489,7 +524,7 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { state.Monitor().Active(true) - doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(6)) + doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(6), false) // Set metrics clockTick() @@ -580,8 +615,9 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: duration("2.3s")}, NeonVMRequest: &core.ActionNeonVMRequest{ - Current: resForCU(6), - Target: resForCU(3), + Current: resForCU(6), + Target: resForCU(3), + DesiredLogicalTime: nil, }, }) // Make the request: @@ -749,7 +785,7 @@ func TestRequestedUpscale(t *testing.T) { state.Monitor().Active(true) // Send initial scheduler request: - doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(1)) + doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(1), false) // Set metrics clockTick() @@ -1013,7 +1049,7 @@ func TestDownscalePivotBack(t *testing.T) { state.Monitor().Active(true) - 
doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(2)) + doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(2), false) clockTick().AssertEquals(duration("0.2s")) pluginWait := duration("4.8s") @@ -1067,7 +1103,7 @@ func TestBoundsChangeRequiresDownsale(t *testing.T) { state.Monitor().Active(true) // Send initial scheduler request: - doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(2)) + doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(2), false) clockTick() @@ -1161,7 +1197,7 @@ func TestBoundsChangeRequiresUpscale(t *testing.T) { state.Monitor().Active(true) // Send initial scheduler request: - doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(2)) + doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(2), false) clockTick() @@ -1257,7 +1293,7 @@ func TestFailedRequestRetry(t *testing.T) { state.Monitor().Active(true) // Send initial scheduler request - doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(1)) + doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(1), false) // Set metrics so that we should be trying to upscale clockTick() diff --git a/pkg/agent/core/testhelpers/construct.go b/pkg/agent/core/testhelpers/construct.go index 4aced58a5..6c1468349 100644 --- a/pkg/agent/core/testhelpers/construct.go +++ b/pkg/agent/core/testhelpers/construct.go @@ -8,6 +8,7 @@ import ( vmapi "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1" "github.com/neondatabase/autoscaling/pkg/agent/core" + "github.com/neondatabase/autoscaling/pkg/agent/core/logiclock" "github.com/neondatabase/autoscaling/pkg/api" ) @@ -36,12 +37,24 @@ type VmInfoOpt interface { modifyVmInfoWithConfig(InitialVmInfoConfig, *api.VmInfo) } +type ClockSourceOpt interface { + InitialStateOpt + + clock() core.LogicClock +} + func CreateInitialState(config InitialStateConfig, opts ...InitialStateOpt) *core.State { vmOpts := []VmInfoOpt{} + var clock core.LogicClock + + clock = &logiclock.NilClock{} for _, o := range opts { if vo, ok := o.(VmInfoOpt); ok { vmOpts = append(vmOpts, vo) } + if co, ok := o.(ClockSourceOpt); ok { + clock = co.clock() + } } vm := CreateVmInfo(config.VM, vmOpts...) 
@@ -50,7 +63,7 @@ func CreateInitialState(config InitialStateConfig, opts ...InitialStateOpt) *cor o.modifyStateConfig(&config.Core) } - return core.NewState(vm, config.Core) + return core.NewState(vm, config.Core, clock) } func CreateVmInfo(config InitialVmInfoConfig, opts ...VmInfoOpt) api.VmInfo { @@ -86,6 +99,7 @@ func CreateVmInfo(config InitialVmInfoConfig, opts ...VmInfoOpt) api.VmInfo { ScalingConfig: nil, ScalingEnabled: true, }, + CurrentLogicalTime: nil, } for _, o := range opts { @@ -98,6 +112,7 @@ func CreateVmInfo(config InitialVmInfoConfig, opts ...VmInfoOpt) api.VmInfo { type coreConfigModifier func(*core.Config) type vmInfoConfigModifier func(*InitialVmInfoConfig) type vmInfoModifier func(InitialVmInfoConfig, *api.VmInfo) +type clockInjector func() core.LogicClock var ( _ VmInfoOpt = vmInfoConfigModifier(nil) @@ -118,6 +133,11 @@ func (m vmInfoModifier) modifyVmInfoWithConfig(c InitialVmInfoConfig, vm *api.Vm (func(InitialVmInfoConfig, *api.VmInfo))(m)(c, vm) } +func (m clockInjector) modifyStateConfig(*core.Config) {} +func (m clockInjector) clock() core.LogicClock { + return m() +} + func WithConfigSetting(f func(*core.Config)) InitialStateOpt { return coreConfigModifier(f) } @@ -158,3 +178,9 @@ func WithCurrentCU(cu uint16) VmInfoOpt { vm.SetUsing(c.ComputeUnit.Mul(cu)) }) } + +func WithClock(c core.LogicClock) ClockSourceOpt { + return clockInjector(func() core.LogicClock { + return c + }) +} diff --git a/pkg/agent/execbridge.go b/pkg/agent/execbridge.go index 7ea6d89dc..b9a503bd4 100644 --- a/pkg/agent/execbridge.go +++ b/pkg/agent/execbridge.go @@ -11,6 +11,7 @@ import ( "go.uber.org/zap" + vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1" "github.com/neondatabase/autoscaling/pkg/agent/executor" "github.com/neondatabase/autoscaling/pkg/api" ) @@ -86,10 +87,15 @@ func makeNeonVMInterface(r *Runner) *execNeonVMInterface { } // Request implements executor.NeonVMInterface -func (iface *execNeonVMInterface) Request(ctx context.Context, logger *zap.Logger, current, target api.Resources) error { +func (iface *execNeonVMInterface) Request( + ctx context.Context, + logger *zap.Logger, + current, target api.Resources, + desiredLogicalTime *vmv1.LogicalTime, +) error { iface.runner.recordResourceChange(current, target, iface.runner.global.metrics.neonvmRequestedChange) - err := iface.runner.doNeonVMRequest(ctx, target) + err := iface.runner.doNeonVMRequest(ctx, target, desiredLogicalTime) if err != nil { iface.runner.status.update(iface.runner.global, func(ps podStatus) podStatus { ps.failedNeonVMRequestCounter.Inc() diff --git a/pkg/agent/executor/core.go b/pkg/agent/executor/core.go index ef8a22a22..9e160d34f 100644 --- a/pkg/agent/executor/core.go +++ b/pkg/agent/executor/core.go @@ -19,6 +19,7 @@ import ( "go.uber.org/zap" "github.com/neondatabase/autoscaling/pkg/agent/core" + "github.com/neondatabase/autoscaling/pkg/agent/core/logiclock" "github.com/neondatabase/autoscaling/pkg/api" "github.com/neondatabase/autoscaling/pkg/util" ) @@ -53,11 +54,16 @@ type ClientSet struct { Monitor MonitorInterface } -func NewExecutorCore(stateLogger *zap.Logger, vm api.VmInfo, config Config) *ExecutorCore { +func NewExecutorCore( + stateLogger *zap.Logger, + vm api.VmInfo, + config Config, + clockSource *logiclock.Clock, +) *ExecutorCore { return &ExecutorCore{ mu: sync.Mutex{}, stateLogger: stateLogger, - core: core.NewState(vm, config.Core), + core: core.NewState(vm, config.Core, clockSource), actions: nil, // (*ExecutorCore).getActions() checks if this is nil 
lastActionsID: -1, onNextActions: config.OnNextActions, diff --git a/pkg/agent/executor/exec_monitor.go b/pkg/agent/executor/exec_monitor.go index 4431a5bcb..8595c7795 100644 --- a/pkg/agent/executor/exec_monitor.go +++ b/pkg/agent/executor/exec_monitor.go @@ -107,6 +107,7 @@ func (c *ExecutorCoreWithClients) DoMonitorDownscales(ctx context.Context, logge logger.Info("vm-monitor approved downscale", logFields...) if unchanged { state.Monitor().DownscaleRequestAllowed(endTime) + state.Monitor().UpdateLogicalTime(action.DesiredLogicalTime.Rewind(endTime)) } else { warnSkipBecauseChanged() } @@ -187,6 +188,7 @@ func (c *ExecutorCoreWithClients) DoMonitorUpscales(ctx context.Context, logger logger.Info("vm-monitor upscale request successful", logFields...) if unchanged { state.Monitor().UpscaleRequestSuccessful(endTime) + state.Monitor().UpdateLogicalTime(action.DesiredLogicalTime.Rewind(endTime)) } else { warnSkipBecauseChanged() } diff --git a/pkg/agent/executor/exec_neonvm.go b/pkg/agent/executor/exec_neonvm.go index 7d4eecd4a..442360019 100644 --- a/pkg/agent/executor/exec_neonvm.go +++ b/pkg/agent/executor/exec_neonvm.go @@ -6,13 +6,19 @@ import ( "go.uber.org/zap" + vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1" "github.com/neondatabase/autoscaling/pkg/agent/core" "github.com/neondatabase/autoscaling/pkg/api" "github.com/neondatabase/autoscaling/pkg/util" ) type NeonVMInterface interface { - Request(_ context.Context, _ *zap.Logger, current, target api.Resources) error + Request( + _ context.Context, + _ *zap.Logger, + current, target api.Resources, + desiredLogicalTime *vmv1.LogicalTime, + ) error } func (c *ExecutorCoreWithClients) DoNeonVMRequests(ctx context.Context, logger *zap.Logger) { @@ -46,7 +52,7 @@ func (c *ExecutorCoreWithClients) DoNeonVMRequests(ctx context.Context, logger * continue // state has changed, retry. } - err := c.clients.NeonVM.Request(ctx, ifaceLogger, action.Current, action.Target) + err := c.clients.NeonVM.Request(ctx, ifaceLogger, action.Current, action.Target, action.DesiredLogicalTime) endTime := time.Now() logFields := []zap.Field{zap.Object("action", action), zap.Duration("duration", endTime.Sub(startTime))} diff --git a/pkg/agent/executor/exec_plugin.go b/pkg/agent/executor/exec_plugin.go index 66bcf5dc0..2f4777d3f 100644 --- a/pkg/agent/executor/exec_plugin.go +++ b/pkg/agent/executor/exec_plugin.go @@ -64,6 +64,7 @@ func (c *ExecutorCoreWithClients) DoPluginRequests(ctx context.Context, logger * if err := state.Plugin().RequestSuccessful(endTime, *resp); err != nil { logger.Error("Plugin response validation failed", append(logFields, zap.Error(err))...) 
} + state.Plugin().UpdateLogicalTime(action.DesiredLogicalTime.Rewind(endTime)) } }) } diff --git a/pkg/agent/prommetrics.go b/pkg/agent/prommetrics.go index 4a004351c..fe480ea40 100644 --- a/pkg/agent/prommetrics.go +++ b/pkg/agent/prommetrics.go @@ -26,6 +26,8 @@ type GlobalMetrics struct { runnerStarts prometheus.Counter runnerRestarts prometheus.Counter runnerNextActions prometheus.Counter + + scalingLatency prometheus.Histogram } type resourceChangePair struct { @@ -217,6 +219,13 @@ func makeGlobalMetrics() (GlobalMetrics, *prometheus.Registry) { Help: "Number of times (*core.State).NextActions() has been called", }, )), + + scalingLatency: util.RegisterMetric(reg, prometheus.NewHistogram( + prometheus.HistogramOpts{ + Name: "autoscaling_agent_scaling_latency_seconds", + Help: "End-to-end scaling latency", + }, + )), } // Some of of the metrics should have default keys set to zero. Otherwise, these won't be filled diff --git a/pkg/agent/runner.go b/pkg/agent/runner.go index 085ccad63..b3a7c6b8f 100644 --- a/pkg/agent/runner.go +++ b/pkg/agent/runner.go @@ -31,7 +31,9 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ktypes "k8s.io/apimachinery/pkg/types" + vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1" "github.com/neondatabase/autoscaling/pkg/agent/core" + "github.com/neondatabase/autoscaling/pkg/agent/core/logiclock" "github.com/neondatabase/autoscaling/pkg/agent/executor" "github.com/neondatabase/autoscaling/pkg/agent/schedwatch" "github.com/neondatabase/autoscaling/pkg/api" @@ -194,6 +196,9 @@ func (r *Runner) Run(ctx context.Context, logger *zap.Logger, vmInfoUpdated util pluginRequestJitter := util.NewTimeRange(time.Millisecond, 0, 100).Random() coreExecLogger := execLogger.Named("core") + clock := logiclock.NewClock(func(duration time.Duration) { + r.global.metrics.scalingLatency.Observe(duration.Seconds()) + }) executorCore := executor.NewExecutorCore(coreExecLogger, getVmInfo(), executor.Config{ OnNextActions: r.global.metrics.runnerNextActions.Inc, Core: core.Config{ @@ -211,7 +216,7 @@ func (r *Runner) Run(ctx context.Context, logger *zap.Logger, vmInfoUpdated util Warn: coreExecLogger.Warn, }, }, - }) + }, clock) r.executorStateDump = executorCore.StateDump @@ -625,7 +630,11 @@ func doMetricsRequest( return nil } -func (r *Runner) doNeonVMRequest(ctx context.Context, target api.Resources) error { +func (r *Runner) doNeonVMRequest( + ctx context.Context, + target api.Resources, + desiredLogicalTime *vmv1.LogicalTime, +) error { patches := []patch.Operation{{ Op: patch.OpReplace, Path: "/spec/guest/cpus/use", @@ -634,8 +643,14 @@ func (r *Runner) doNeonVMRequest(ctx context.Context, target api.Resources) erro Op: patch.OpReplace, Path: "/spec/guest/memorySlots/use", Value: uint32(target.Mem / r.memSlotSize), + }, { + Op: patch.OpReplace, + Path: "/spec/guest/desiredLogicalTime", + Value: desiredLogicalTime, }} + fmt.Printf("Desired Logical Time: %v\n", desiredLogicalTime) + patchPayload, err := json.Marshal(patches) if err != nil { panic(fmt.Errorf("Error marshalling JSON patch: %w", err)) diff --git a/pkg/api/vminfo.go b/pkg/api/vminfo.go index d5dc8fdf6..966b94fb9 100644 --- a/pkg/api/vminfo.go +++ b/pkg/api/vminfo.go @@ -53,11 +53,12 @@ func HasAlwaysMigrateLabel(obj metav1.ObjectMetaAccessor) bool { // care about. It takes various labels and annotations into account, so certain fields might be // different from what's strictly in the VirtualMachine object. 
type VmInfo struct { - Name string `json:"name"` - Namespace string `json:"namespace"` - Cpu VmCpuInfo `json:"cpu"` - Mem VmMemInfo `json:"mem"` - Config VmConfig `json:"config"` + Name string `json:"name"` + Namespace string `json:"namespace"` + Cpu VmCpuInfo `json:"cpu"` + Mem VmMemInfo `json:"mem"` + Config VmConfig `json:"config"` + CurrentLogicalTime *vmapi.LogicalTime `json:"currentLogicalTime,omitempty"` } type VmCpuInfo struct { @@ -150,7 +151,7 @@ func (vm VmInfo) NamespacedName() util.NamespacedName { func ExtractVmInfo(logger *zap.Logger, vm *vmapi.VirtualMachine) (*VmInfo, error) { logger = logger.With(util.VMNameFields(vm)) - return extractVmInfoGeneric(logger, vm.Name, vm, vm.Spec.Resources()) + return extractVmInfoGeneric(logger, vm.Name, vm, vm.Spec.Resources(), vm.Status.CurrentLogicalTime) } func ExtractVmInfoFromPod(logger *zap.Logger, pod *corev1.Pod) (*VmInfo, error) { @@ -164,7 +165,7 @@ func ExtractVmInfoFromPod(logger *zap.Logger, pod *corev1.Pod) (*VmInfo, error) } vmName := pod.Labels[vmapi.VirtualMachineNameLabel] - return extractVmInfoGeneric(logger, vmName, pod, resources) + return extractVmInfoGeneric(logger, vmName, pod, resources, nil) } func extractVmInfoGeneric( @@ -172,6 +173,7 @@ func extractVmInfoGeneric( vmName string, obj metav1.ObjectMetaAccessor, resources vmapi.VirtualMachineResources, + currentClock *vmapi.LogicalTime, ) (*VmInfo, error) { cpuInfo := NewVmCpuInfo(resources.CPUs) memInfo := NewVmMemInfo(resources.MemorySlots, resources.MemorySlotSize) @@ -191,6 +193,7 @@ func extractVmInfoGeneric( ScalingEnabled: scalingEnabled, ScalingConfig: nil, // set below, maybe }, + CurrentLogicalTime: currentClock, } if boundsJSON, ok := obj.GetObjectMeta().GetAnnotations()[AnnotationAutoscalingBounds]; ok { diff --git a/pkg/plugin/run.go b/pkg/plugin/run.go index b49da078c..36aa47d15 100644 --- a/pkg/plugin/run.go +++ b/pkg/plugin/run.go @@ -309,7 +309,12 @@ func (e *AutoscaleEnforcer) handleResources( }), ) - return api.Resources{VCPU: pod.cpu.Reserved, Mem: pod.mem.Reserved}, 200, nil + result := api.Resources{ + VCPU: pod.cpu.Reserved, + Mem: pod.mem.Reserved, + } + + return result, 200, nil } func (e *AutoscaleEnforcer) updateMetricsAndCheckMustMigrate( From 628552a5192a1421a3f13bc619b87f6a0f218b99 Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Thu, 4 Jul 2024 13:08:31 +0400 Subject: [PATCH 04/57] generate CRD Signed-off-by: Oleg Vasilev --- .../apis/neonvm/v1/zz_generated.deepcopy.go | 26 +++++++++++++++++ .../bases/vm.neon.tech_virtualmachines.yaml | 28 +++++++++++++++++++ pkg/agent/core/state.go | 4 +++ 3 files changed, 58 insertions(+) diff --git a/neonvm/apis/neonvm/v1/zz_generated.deepcopy.go b/neonvm/apis/neonvm/v1/zz_generated.deepcopy.go index 46f3499e5..a41843f10 100644 --- a/neonvm/apis/neonvm/v1/zz_generated.deepcopy.go +++ b/neonvm/apis/neonvm/v1/zz_generated.deepcopy.go @@ -186,6 +186,11 @@ func (in *Guest) DeepCopyInto(out *Guest) { *out = make([]Port, len(*in)) copy(*out, *in) } + if in.DesiredLogicalTime != nil { + in, out := &in.DesiredLogicalTime, &out.DesiredLogicalTime + *out = new(LogicalTime) + (*in).DeepCopyInto(*out) + } if in.Settings != nil { in, out := &in.Settings, &out.Settings *out = new(GuestSettings) @@ -328,6 +333,22 @@ func (in *IPPoolSpec) DeepCopy() *IPPoolSpec { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *LogicalTime) DeepCopyInto(out *LogicalTime) { + *out = *in + in.UpdatedAt.DeepCopyInto(&out.UpdatedAt) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new LogicalTime. +func (in *LogicalTime) DeepCopy() *LogicalTime { + if in == nil { + return nil + } + out := new(LogicalTime) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *MemorySlots) DeepCopyInto(out *MemorySlots) { *out = *in @@ -760,6 +781,11 @@ func (in *VirtualMachineStatus) DeepCopyInto(out *VirtualMachineStatus) { *out = new(MemoryProvider) **out = **in } + if in.CurrentLogicalTime != nil { + in, out := &in.CurrentLogicalTime, &out.CurrentLogicalTime + *out = new(LogicalTime) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VirtualMachineStatus. diff --git a/neonvm/config/crd/bases/vm.neon.tech_virtualmachines.yaml b/neonvm/config/crd/bases/vm.neon.tech_virtualmachines.yaml index 5f3af1dad..45d864501 100644 --- a/neonvm/config/crd/bases/vm.neon.tech_virtualmachines.yaml +++ b/neonvm/config/crd/bases/vm.neon.tech_virtualmachines.yaml @@ -2397,6 +2397,20 @@ spec: - min - use type: object + desiredLogicalTime: + description: Logical clock value corresponding to the desired + resources of the VM. + properties: + updatedAt: + format: date-time + type: string + value: + format: int64 + type: integer + required: + - updatedAt + - value + type: object env: description: List of environment variables to set in the vmstart process. @@ -2771,6 +2785,20 @@ spec: pattern: ^[0-9]+((\.[0-9]*)?|m) type: integer x-kubernetes-int-or-string: true + currentLogicalTime: + description: LogicalTime allows to track progress of changes to a + VM. + properties: + updatedAt: + format: date-time + type: string + value: + format: int64 + type: integer + required: + - updatedAt + - value + type: object extraNetIP: type: string extraNetMask: diff --git a/pkg/agent/core/state.go b/pkg/agent/core/state.go index f6fa38be5..c7d4d985a 100644 --- a/pkg/agent/core/state.go +++ b/pkg/agent/core/state.go @@ -275,6 +275,8 @@ func (s *state) nextActions(now time.Time) ActionSet { } desiredLogicalTime := s.ClockSource.Next(now) + fmt.Printf("new desired time: %v\n", desiredLogicalTime) + // ---- // Requests to the scheduler plugin: var pluginRequiredWait *time.Duration @@ -472,6 +474,8 @@ func (s *state) calculateNeonVMAction( desiredTime := vmv1.EarliestLogicalTime(desiredTimeCandidates...) 
+ fmt.Printf("Neonvm desired time: %v\n", desiredTime) + // clamp desiredResources to what we're allowed to make a request for desiredResources = s.clampResources( s.VM.Using(), // current: what we're using already From d5f67983371e68fbd20e4b61a693ce25d4cb9879 Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Thu, 4 Jul 2024 14:37:28 +0400 Subject: [PATCH 05/57] add kind Signed-off-by: Oleg Vasilev --- pkg/agent/core/logiclock/logiclock.go | 47 ++++++++++++++-------- pkg/agent/core/logiclock/logiclock_test.go | 12 +++--- pkg/agent/core/state.go | 12 +++++- pkg/agent/prommetrics.go | 6 +-- pkg/agent/runner.go | 5 ++- 5 files changed, 54 insertions(+), 28 deletions(-) diff --git a/pkg/agent/core/logiclock/logiclock.go b/pkg/agent/core/logiclock/logiclock.go index 24d34c25e..6afa6adff 100644 --- a/pkg/agent/core/logiclock/logiclock.go +++ b/pkg/agent/core/logiclock/logiclock.go @@ -9,30 +9,45 @@ import ( vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1" ) +type Kind string + +const ( + KindUpscale Kind = "upscale" + KindDownscale Kind = "downscale" +) + +type measurement struct { + createdAt time.Time + kind Kind +} + type Clock struct { - cb func(time.Duration) - times []time.Time - offset int64 + cb func(time.Duration, Kind) + measurements []measurement + offset int64 } -func NewClock(cb func(time.Duration)) *Clock { +func NewClock(cb func(time.Duration, Kind)) *Clock { return &Clock{ - cb: cb, - times: nil, - offset: 0, + cb: cb, + measurements: nil, + offset: 0, } } func (c *Clock) NextValue() int64 { - return c.offset + int64(len(c.times)) + return c.offset + int64(len(c.measurements)) } -func (c *Clock) Next(now time.Time) *vmv1.LogicalTime { +func (c *Clock) Next(now time.Time, kind Kind) *vmv1.LogicalTime { ret := vmv1.LogicalTime{ Value: c.NextValue(), UpdatedAt: v1.NewTime(now), } - c.times = append(c.times, ret.UpdatedAt.Time) + c.measurements = append(c.measurements, measurement{ + createdAt: ret.UpdatedAt.Time, + kind: kind, + }) return &ret } @@ -45,23 +60,23 @@ func (c *Clock) Observe(logicalTime *vmv1.LogicalTime) error { } idx := logicalTime.Value - c.offset - if idx > int64(len(c.times)) { + if idx > int64(len(c.measurements)) { return errors.New("logicalTime value is in the future") } - diff := logicalTime.UpdatedAt.Time.Sub(c.times[idx]) + diff := logicalTime.UpdatedAt.Time.Sub(c.measurements[idx].createdAt) if c.cb != nil { - c.cb(diff) + c.cb(diff, c.measurements[idx].kind) } c.offset = logicalTime.Value + 1 - c.times = c.times[idx+1:] + c.measurements = c.measurements[idx+1:] return nil } type NilClock struct{} -func (c *NilClock) Next(now time.Time) *vmv1.LogicalTime { return nil } -func (c *NilClock) Observe(_ *vmv1.LogicalTime) error { return nil } +func (c *NilClock) Next(now time.Time, _ Kind) *vmv1.LogicalTime { return nil } +func (c *NilClock) Observe(_ *vmv1.LogicalTime) error { return nil } diff --git a/pkg/agent/core/logiclock/logiclock_test.go b/pkg/agent/core/logiclock/logiclock_test.go index e8ec9ca80..a380002d8 100644 --- a/pkg/agent/core/logiclock/logiclock_test.go +++ b/pkg/agent/core/logiclock/logiclock_test.go @@ -15,9 +15,10 @@ import ( type testClockMetric struct { *logiclock.Clock - t *testing.T - now v1.Time - result *time.Duration + t *testing.T + now v1.Time + result *time.Duration + resultKind logiclock.Kind } func (tcm *testClockMetric) advance(d time.Duration) { @@ -31,7 +32,7 @@ func (tcm *testClockMetric) assertResult(d time.Duration) { } func (tcm *testClockMetric) nextNow() *vmv1.LogicalTime { - return tcm.Next(tcm.now.Time) 
+ return tcm.Next(tcm.now.Time, logiclock.KindUpscale) } func newTestClockMetric(t *testing.T) *testClockMetric { @@ -42,8 +43,9 @@ func newTestClockMetric(t *testing.T) *testClockMetric { result: nil, } - cb := func(d time.Duration) { + cb := func(d time.Duration, kind logiclock.Kind) { tcm.result = &d + tcm.resultKind = kind } tcm.Clock = logiclock.NewClock(cb) diff --git a/pkg/agent/core/state.go b/pkg/agent/core/state.go index c7d4d985a..34a8bae92 100644 --- a/pkg/agent/core/state.go +++ b/pkg/agent/core/state.go @@ -23,6 +23,7 @@ package core import ( "errors" "fmt" + "github.com/neondatabase/autoscaling/pkg/agent/core/logiclock" "math" "strings" "time" @@ -204,7 +205,7 @@ func (ns *neonvmState) ongoingRequest() bool { } type LogicClock interface { - Next(ts time.Time) *vmv1.LogicalTime + Next(ts time.Time, kind logiclock.Kind) *vmv1.LogicalTime Observe(logicalTime *vmv1.LogicalTime) error } @@ -273,7 +274,14 @@ func (s *state) nextActions(now time.Time) ActionSet { // our handling later on is easier if we can assume it's non-nil calcDesiredResourcesWait = func(ActionSet) *time.Duration { return nil } } - desiredLogicalTime := s.ClockSource.Next(now) + var kind logiclock.Kind + if desiredResources.HasFieldLessThan(s.VM.Using()) { + kind = logiclock.KindDownscale + } else { + kind = logiclock.KindUpscale + } + + desiredLogicalTime := s.ClockSource.Next(now, kind) fmt.Printf("new desired time: %v\n", desiredLogicalTime) diff --git a/pkg/agent/prommetrics.go b/pkg/agent/prommetrics.go index fe480ea40..a525ba981 100644 --- a/pkg/agent/prommetrics.go +++ b/pkg/agent/prommetrics.go @@ -27,7 +27,7 @@ type GlobalMetrics struct { runnerRestarts prometheus.Counter runnerNextActions prometheus.Counter - scalingLatency prometheus.Histogram + scalingLatency prometheus.HistogramVec } type resourceChangePair struct { @@ -220,11 +220,11 @@ func makeGlobalMetrics() (GlobalMetrics, *prometheus.Registry) { }, )), - scalingLatency: util.RegisterMetric(reg, prometheus.NewHistogram( + scalingLatency: *util.RegisterMetric(reg, prometheus.NewHistogramVec( prometheus.HistogramOpts{ Name: "autoscaling_agent_scaling_latency_seconds", Help: "End-to-end scaling latency", - }, + }, []string{"kind"}, )), } diff --git a/pkg/agent/runner.go b/pkg/agent/runner.go index b3a7c6b8f..04b6f637a 100644 --- a/pkg/agent/runner.go +++ b/pkg/agent/runner.go @@ -196,8 +196,9 @@ func (r *Runner) Run(ctx context.Context, logger *zap.Logger, vmInfoUpdated util pluginRequestJitter := util.NewTimeRange(time.Millisecond, 0, 100).Random() coreExecLogger := execLogger.Named("core") - clock := logiclock.NewClock(func(duration time.Duration) { - r.global.metrics.scalingLatency.Observe(duration.Seconds()) + clock := logiclock.NewClock(func(duration time.Duration, kind logiclock.Kind) { + labels := []string{string(kind)} + r.global.metrics.scalingLatency.WithLabelValues(labels...).Observe(duration.Seconds()) }) executorCore := executor.NewExecutorCore(coreExecLogger, getVmInfo(), executor.Config{ OnNextActions: r.global.metrics.runnerNextActions.Inc, From 3af9d6233fe580dd8401427cf5a50be6d58a9b96 Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Fri, 5 Jul 2024 14:28:06 +0400 Subject: [PATCH 06/57] fix lint Signed-off-by: Oleg Vasilev --- pkg/agent/core/logiclock/logiclock_test.go | 13 +++++++------ pkg/agent/core/state.go | 2 +- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/pkg/agent/core/logiclock/logiclock_test.go b/pkg/agent/core/logiclock/logiclock_test.go index a380002d8..631dba287 100644 --- 
a/pkg/agent/core/logiclock/logiclock_test.go +++ b/pkg/agent/core/logiclock/logiclock_test.go @@ -18,7 +18,7 @@ type testClockMetric struct { t *testing.T now v1.Time result *time.Duration - resultKind logiclock.Kind + resultKind *logiclock.Kind } func (tcm *testClockMetric) advance(d time.Duration) { @@ -37,15 +37,16 @@ func (tcm *testClockMetric) nextNow() *vmv1.LogicalTime { func newTestClockMetric(t *testing.T) *testClockMetric { tcm := &testClockMetric{ - Clock: nil, - t: t, - now: v1.NewTime(time.Now()), - result: nil, + Clock: nil, + t: t, + now: v1.NewTime(time.Now()), + result: nil, + resultKind: nil, } cb := func(d time.Duration, kind logiclock.Kind) { tcm.result = &d - tcm.resultKind = kind + tcm.resultKind = &kind } tcm.Clock = logiclock.NewClock(cb) diff --git a/pkg/agent/core/state.go b/pkg/agent/core/state.go index 34a8bae92..ab35b7298 100644 --- a/pkg/agent/core/state.go +++ b/pkg/agent/core/state.go @@ -23,7 +23,6 @@ package core import ( "errors" "fmt" - "github.com/neondatabase/autoscaling/pkg/agent/core/logiclock" "math" "strings" "time" @@ -32,6 +31,7 @@ import ( "go.uber.org/zap" vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1" + "github.com/neondatabase/autoscaling/pkg/agent/core/logiclock" "github.com/neondatabase/autoscaling/pkg/api" "github.com/neondatabase/autoscaling/pkg/util" ) From 70fef0b139df7614803c24fd8f1c6f275da3c59c Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Mon, 8 Jul 2024 14:19:01 +0400 Subject: [PATCH 07/57] replace kind with flags Signed-off-by: Oleg Vasilev --- pkg/agent/core/dumpstate.go | 17 +++--- pkg/agent/core/logiclock/logiclock.go | 35 ++++++++---- pkg/agent/core/logiclock/logiclock_test.go | 6 +-- pkg/agent/core/state.go | 62 +++++++++++++--------- pkg/agent/prommetrics.go | 2 +- pkg/agent/runner.go | 13 ++++- 6 files changed, 84 insertions(+), 51 deletions(-) diff --git a/pkg/agent/core/dumpstate.go b/pkg/agent/core/dumpstate.go index ca862c4da..8a3bc8ad4 100644 --- a/pkg/agent/core/dumpstate.go +++ b/pkg/agent/core/dumpstate.go @@ -34,14 +34,15 @@ func (d StateDump) MarshalJSON() ([]byte, error) { func (s *State) Dump() StateDump { return StateDump{ internal: state{ - Debug: s.internal.Debug, - Config: s.internal.Config, - VM: s.internal.VM, - Plugin: s.internal.Plugin.deepCopy(), - Monitor: s.internal.Monitor.deepCopy(), - NeonVM: s.internal.NeonVM.deepCopy(), - Metrics: shallowCopy[SystemMetrics](s.internal.Metrics), - ClockSource: s.internal.ClockSource, + Debug: s.internal.Debug, + Config: s.internal.Config, + VM: s.internal.VM, + Plugin: s.internal.Plugin.deepCopy(), + Monitor: s.internal.Monitor.deepCopy(), + NeonVM: s.internal.NeonVM.deepCopy(), + Metrics: shallowCopy[SystemMetrics](s.internal.Metrics), + ClockSource: s.internal.ClockSource, + DesiredLogicalTime: s.internal.DesiredLogicalTime, }, } } diff --git a/pkg/agent/core/logiclock/logiclock.go b/pkg/agent/core/logiclock/logiclock.go index 6afa6adff..5eb9e5276 100644 --- a/pkg/agent/core/logiclock/logiclock.go +++ b/pkg/agent/core/logiclock/logiclock.go @@ -9,25 +9,38 @@ import ( vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1" ) -type Kind string +type Flag uint64 const ( - KindUpscale Kind = "upscale" - KindDownscale Kind = "downscale" + Upscale Flag = 1 << iota + Downscale + Immediate ) +func (f *Flag) Set(flag Flag) { + *f |= flag +} + +func (f *Flag) Clear(flag Flag) { + *f &= ^flag +} + +func (f Flag) Has(flag Flag) bool { + return f&flag != 0 +} + type measurement struct { createdAt time.Time - kind Kind + flags Flag } type Clock 
struct { - cb func(time.Duration, Kind) + cb func(time.Duration, Flag) measurements []measurement offset int64 } -func NewClock(cb func(time.Duration, Kind)) *Clock { +func NewClock(cb func(time.Duration, Flag)) *Clock { return &Clock{ cb: cb, measurements: nil, @@ -39,14 +52,14 @@ func (c *Clock) NextValue() int64 { return c.offset + int64(len(c.measurements)) } -func (c *Clock) Next(now time.Time, kind Kind) *vmv1.LogicalTime { +func (c *Clock) Next(now time.Time, flags Flag) *vmv1.LogicalTime { ret := vmv1.LogicalTime{ Value: c.NextValue(), UpdatedAt: v1.NewTime(now), } c.measurements = append(c.measurements, measurement{ createdAt: ret.UpdatedAt.Time, - kind: kind, + flags: flags, }) return &ret } @@ -67,7 +80,7 @@ func (c *Clock) Observe(logicalTime *vmv1.LogicalTime) error { diff := logicalTime.UpdatedAt.Time.Sub(c.measurements[idx].createdAt) if c.cb != nil { - c.cb(diff, c.measurements[idx].kind) + c.cb(diff, c.measurements[idx].flags) } c.offset = logicalTime.Value + 1 @@ -78,5 +91,5 @@ func (c *Clock) Observe(logicalTime *vmv1.LogicalTime) error { type NilClock struct{} -func (c *NilClock) Next(now time.Time, _ Kind) *vmv1.LogicalTime { return nil } -func (c *NilClock) Observe(_ *vmv1.LogicalTime) error { return nil } +func (c *NilClock) Next(_ time.Time, _ Flag) *vmv1.LogicalTime { return nil } +func (c *NilClock) Observe(_ *vmv1.LogicalTime) error { return nil } diff --git a/pkg/agent/core/logiclock/logiclock_test.go b/pkg/agent/core/logiclock/logiclock_test.go index 631dba287..d5e82c899 100644 --- a/pkg/agent/core/logiclock/logiclock_test.go +++ b/pkg/agent/core/logiclock/logiclock_test.go @@ -18,7 +18,7 @@ type testClockMetric struct { t *testing.T now v1.Time result *time.Duration - resultKind *logiclock.Kind + resultKind *logiclock.Flag } func (tcm *testClockMetric) advance(d time.Duration) { @@ -32,7 +32,7 @@ func (tcm *testClockMetric) assertResult(d time.Duration) { } func (tcm *testClockMetric) nextNow() *vmv1.LogicalTime { - return tcm.Next(tcm.now.Time, logiclock.KindUpscale) + return tcm.Next(tcm.now.Time, logiclock.Upscale) } func newTestClockMetric(t *testing.T) *testClockMetric { @@ -44,7 +44,7 @@ func newTestClockMetric(t *testing.T) *testClockMetric { resultKind: nil, } - cb := func(d time.Duration, kind logiclock.Kind) { + cb := func(d time.Duration, kind logiclock.Flag) { tcm.result = &d tcm.resultKind = &kind } diff --git a/pkg/agent/core/state.go b/pkg/agent/core/state.go index ab35b7298..733e6a292 100644 --- a/pkg/agent/core/state.go +++ b/pkg/agent/core/state.go @@ -117,7 +117,8 @@ type state struct { Metrics *SystemMetrics - ClockSource LogicClock `json:"-"` + ClockSource LogicClock `json:"-"` + DesiredLogicalTime *vmv1.LogicalTime } type pluginState struct { @@ -205,7 +206,7 @@ func (ns *neonvmState) ongoingRequest() bool { } type LogicClock interface { - Next(ts time.Time, kind logiclock.Kind) *vmv1.LogicalTime + Next(ts time.Time, kind logiclock.Flag) *vmv1.LogicalTime Observe(logicalTime *vmv1.LogicalTime) error } @@ -236,8 +237,9 @@ func NewState(vm api.VmInfo, config Config, clockSource LogicClock) *State { OngoingRequested: nil, RequestFailedAt: nil, }, - Metrics: nil, - ClockSource: clockSource, + Metrics: nil, + ClockSource: clockSource, + DesiredLogicalTime: nil, }, } @@ -274,21 +276,11 @@ func (s *state) nextActions(now time.Time) ActionSet { // our handling later on is easier if we can assume it's non-nil calcDesiredResourcesWait = func(ActionSet) *time.Duration { return nil } } - var kind logiclock.Kind - if 
desiredResources.HasFieldLessThan(s.VM.Using()) { - kind = logiclock.KindDownscale - } else { - kind = logiclock.KindUpscale - } - - desiredLogicalTime := s.ClockSource.Next(now, kind) - - fmt.Printf("new desired time: %v\n", desiredLogicalTime) // ---- // Requests to the scheduler plugin: var pluginRequiredWait *time.Duration - actions.PluginRequest, pluginRequiredWait = s.calculatePluginAction(now, desiredResources, desiredLogicalTime) + actions.PluginRequest, pluginRequiredWait = s.calculatePluginAction(now, desiredResources, s.DesiredLogicalTime) // ---- // Requests to NeonVM: @@ -302,7 +294,7 @@ func (s *state) nextActions(now time.Time) ActionSet { pluginRequestedPhase = "planned" } var neonvmRequiredWait *time.Duration - actions.NeonVMRequest, neonvmRequiredWait = s.calculateNeonVMAction(now, desiredResources, pluginRequested, pluginRequestedPhase, desiredLogicalTime) + actions.NeonVMRequest, neonvmRequiredWait = s.calculateNeonVMAction(now, desiredResources, pluginRequested, pluginRequestedPhase, s.DesiredLogicalTime) // ---- // Requests to vm-monitor (upscaling) @@ -311,7 +303,7 @@ func (s *state) nextActions(now time.Time) ActionSet { // forego notifying the vm-monitor of increased resources because we were busy asking if it // could downscale. var monitorUpscaleRequiredWait *time.Duration - actions.MonitorUpscale, monitorUpscaleRequiredWait = s.calculateMonitorUpscaleAction(now, desiredResources, desiredLogicalTime) + actions.MonitorUpscale, monitorUpscaleRequiredWait = s.calculateMonitorUpscaleAction(now, desiredResources, s.DesiredLogicalTime) // ---- // Requests to vm-monitor (downscaling) @@ -322,7 +314,7 @@ func (s *state) nextActions(now time.Time) ActionSet { now, desiredResources, plannedUpscale, - desiredLogicalTime, + s.DesiredLogicalTime, ) // --- and that's all the request types! 
--- @@ -850,7 +842,7 @@ func (s *state) desiredResourcesFromMetricsOrRequestedUpscaling(now time.Time) ( return nil } } - + s.updateDesiredClock(now, result, s.VM.Using()) s.updateCurrentClock(s.VM.CurrentLogicalTime) s.info("Calculated desired resources", zap.Object("current", s.VM.Using()), zap.Object("target", result)) @@ -858,6 +850,31 @@ func (s *state) desiredResourcesFromMetricsOrRequestedUpscaling(now time.Time) ( return result, calculateWaitTime } +func (s *state) updateDesiredClock( + now time.Time, + desired api.Resources, + current api.Resources, +) { + var flags logiclock.Flag + if desired.HasFieldGreaterThan(current) { + flags.Set(logiclock.Upscale) + } + if desired.HasFieldLessThan(current) { + flags.Set(logiclock.Downscale) + } + + s.DesiredLogicalTime = s.ClockSource.Next(now, flags) + + fmt.Printf("new desired time: %v\n", s.DesiredLogicalTime) +} + +func (s *state) updateCurrentClock(logicalTime *vmv1.LogicalTime) { + err := s.ClockSource.Observe(logicalTime) + if err != nil { + s.warnf("Failed to observe clock source: %v", err) + } +} + func (s *state) timeUntilRequestedUpscalingExpired(now time.Time) time.Duration { if s.Monitor.RequestedUpscale != nil { return s.Monitor.RequestedUpscale.At.Add(s.Config.MonitorRequestedUpscaleValidPeriod).Sub(now) @@ -972,13 +989,6 @@ func (s *state) pluginApprovedUpperBound() api.Resources { } } -func (s *state) updateCurrentClock(logicalTime *vmv1.LogicalTime) { - err := s.ClockSource.Observe(logicalTime) - if err != nil { - s.warnf("Failed to observe clock source: %v", err) - } -} - ////////////////////////////////////////// // PUBLIC FUNCTIONS TO UPDATE THE STATE // ////////////////////////////////////////// diff --git a/pkg/agent/prommetrics.go b/pkg/agent/prommetrics.go index a525ba981..dcbaae709 100644 --- a/pkg/agent/prommetrics.go +++ b/pkg/agent/prommetrics.go @@ -224,7 +224,7 @@ func makeGlobalMetrics() (GlobalMetrics, *prometheus.Registry) { prometheus.HistogramOpts{ Name: "autoscaling_agent_scaling_latency_seconds", Help: "End-to-end scaling latency", - }, []string{"kind"}, + }, []string{"upscale", "downscale", "immediate"}, )), } diff --git a/pkg/agent/runner.go b/pkg/agent/runner.go index 04b6f637a..daa5dad8a 100644 --- a/pkg/agent/runner.go +++ b/pkg/agent/runner.go @@ -196,8 +196,17 @@ func (r *Runner) Run(ctx context.Context, logger *zap.Logger, vmInfoUpdated util pluginRequestJitter := util.NewTimeRange(time.Millisecond, 0, 100).Random() coreExecLogger := execLogger.Named("core") - clock := logiclock.NewClock(func(duration time.Duration, kind logiclock.Kind) { - labels := []string{string(kind)} + clock := logiclock.NewClock(func(duration time.Duration, flag logiclock.Flag) { + labels := []string{"false", "false", "false"} + if flag.Has(logiclock.Upscale) { + labels[0] = "true" + } + if flag.Has(logiclock.Downscale) { + labels[1] = "true" + } + if flag.Has(logiclock.Immediate) { + labels[2] = "true" + } r.global.metrics.scalingLatency.WithLabelValues(labels...).Observe(duration.Seconds()) }) executorCore := executor.NewExecutorCore(coreExecLogger, getVmInfo(), executor.Config{ From 20be2d9fc505b5a63a2c6d0be21f38f96a6da982 Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Mon, 8 Jul 2024 14:21:39 +0400 Subject: [PATCH 08/57] finish the implementation Signed-off-by: Oleg Vasilev --- pkg/agent/core/state.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pkg/agent/core/state.go b/pkg/agent/core/state.go index 733e6a292..f7c8a840b 100644 --- a/pkg/agent/core/state.go +++ b/pkg/agent/core/state.go @@ 
-842,7 +842,7 @@ func (s *state) desiredResourcesFromMetricsOrRequestedUpscaling(now time.Time) ( return nil } } - s.updateDesiredClock(now, result, s.VM.Using()) + s.updateDesiredClock(now, result, s.VM.Using(), requestedUpscalingAffectedResult) s.updateCurrentClock(s.VM.CurrentLogicalTime) s.info("Calculated desired resources", zap.Object("current", s.VM.Using()), zap.Object("target", result)) @@ -854,6 +854,7 @@ func (s *state) updateDesiredClock( now time.Time, desired api.Resources, current api.Resources, + immediate bool, ) { var flags logiclock.Flag if desired.HasFieldGreaterThan(current) { @@ -862,6 +863,9 @@ func (s *state) updateDesiredClock( if desired.HasFieldLessThan(current) { flags.Set(logiclock.Downscale) } + if immediate { + flags.Set(logiclock.Immediate) + } s.DesiredLogicalTime = s.ClockSource.Next(now, flags) From fc2e3d62c5ddb44b617b222d641ced995f463db4 Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Mon, 8 Jul 2024 14:23:39 +0400 Subject: [PATCH 09/57] fix earliest Signed-off-by: Oleg Vasilev --- neonvm/apis/neonvm/v1/virtualmachine_types.go | 20 +++++++++---------- pkg/agent/core/state.go | 8 +++----- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/neonvm/apis/neonvm/v1/virtualmachine_types.go b/neonvm/apis/neonvm/v1/virtualmachine_types.go index 144e7ff8b..8c744c82f 100644 --- a/neonvm/apis/neonvm/v1/virtualmachine_types.go +++ b/neonvm/apis/neonvm/v1/virtualmachine_types.go @@ -240,17 +240,17 @@ func (t *LogicalTime) RewindNow() *LogicalTime { return t.Rewind(time.Now()) } -func EarliestLogicalTime(ts ...*LogicalTime) *LogicalTime { - var earliest *LogicalTime - for _, t := range ts { - if t == nil { - return nil - } - if earliest == nil || t.UpdatedAt.Before(&earliest.UpdatedAt) { - earliest = t - } +func (t *LogicalTime) Earliest(other *LogicalTime) *LogicalTime { + if t == nil { + return other + } + if other == nil { + return t + } + if t.UpdatedAt.Before(&other.UpdatedAt) { + return t } - return earliest + return other } type GuestSettings struct { diff --git a/pkg/agent/core/state.go b/pkg/agent/core/state.go index f7c8a840b..5e4fabbf3 100644 --- a/pkg/agent/core/state.go +++ b/pkg/agent/core/state.go @@ -460,20 +460,18 @@ func (s *state) calculateNeonVMAction( logicalTime *vmv1.LogicalTime, ) (*ActionNeonVMRequest, *time.Duration) { - desiredTimeCandidates := []*vmv1.LogicalTime{logicalTime} + desiredTime := logicalTime if desiredResources.HasFieldLessThan(s.VM.Using()) { // We are downscaling, so we needed a permit from monitor - desiredTimeCandidates = append(desiredTimeCandidates, s.Monitor.CurrentLogicalTime) + desiredTime = desiredTime.Earliest(s.Monitor.CurrentLogicalTime) } if desiredResources.HasFieldGreaterThan(s.VM.Using()) { // We are upscaling, so we needed a permit from the plugin - desiredTimeCandidates = append(desiredTimeCandidates, s.Plugin.CurrentLogicalTime) + desiredTime = desiredTime.Earliest(s.Plugin.CurrentLogicalTime) } - desiredTime := vmv1.EarliestLogicalTime(desiredTimeCandidates...) 
- fmt.Printf("Neonvm desired time: %v\n", desiredTime) // clamp desiredResources to what we're allowed to make a request for From e6b714469d9c9fb4df920dd14754abdeda4bda6c Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Mon, 8 Jul 2024 14:31:46 +0400 Subject: [PATCH 10/57] fix tests Signed-off-by: Oleg Vasilev --- pkg/agent/core/state.go | 4 ---- pkg/agent/core/state_test.go | 8 ++++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/pkg/agent/core/state.go b/pkg/agent/core/state.go index 5e4fabbf3..ad32d093b 100644 --- a/pkg/agent/core/state.go +++ b/pkg/agent/core/state.go @@ -472,8 +472,6 @@ func (s *state) calculateNeonVMAction( desiredTime = desiredTime.Earliest(s.Plugin.CurrentLogicalTime) } - fmt.Printf("Neonvm desired time: %v\n", desiredTime) - // clamp desiredResources to what we're allowed to make a request for desiredResources = s.clampResources( s.VM.Using(), // current: what we're using already @@ -866,8 +864,6 @@ func (s *state) updateDesiredClock( } s.DesiredLogicalTime = s.ClockSource.Next(now, flags) - - fmt.Printf("new desired time: %v\n", s.DesiredLogicalTime) } func (s *state) updateCurrentClock(logicalTime *vmv1.LogicalTime) { diff --git a/pkg/agent/core/state_test.go b/pkg/agent/core/state_test.go index 6ad3a68fa..3ee52345a 100644 --- a/pkg/agent/core/state_test.go +++ b/pkg/agent/core/state_test.go @@ -284,7 +284,7 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { a.Call(getDesiredResources, state, clock.Now()). Equals(resForCU(2)) - lt := logicalTime(clock, 1) + lt := logicalTime(clock, 2) // Now that the initial scheduler request is done, and we have metrics that indicate // scale-up would be a good idea, we should be contacting the scheduler to get approval. @@ -329,7 +329,7 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { a.Do(state.NeonVM().RequestSuccessful, clock.Now()) state.UpdateCurrentClock(lt) - lt = logicalTime(clock, 5) + lt = logicalTime(clock, 6) // NeonVM change is done, now we should finish by notifying the vm-monitor a.Call(nextActions).Equals(core.ActionSet{ @@ -370,7 +370,7 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { a.Call(getDesiredResources, state, clock.Now()). 
Equals(resForCU(1)) - lt = logicalTime(clock, 8) + lt = logicalTime(clock, 10) // First step in downscaling is getting approval from the vm-monitor: a.Call(nextActions).Equals(core.ActionSet{ @@ -407,7 +407,7 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { }) a.Do(state.NeonVM().RequestSuccessful, clock.Now()) - lt = logicalTime(clock, 12) + lt = logicalTime(clock, 14) // Request to NeonVM completed, it's time to inform the scheduler plugin: a.Call(nextActions).Equals(core.ActionSet{ From 0e1ab3bd17abce2c66ad9461d55b2d149eadc024 Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Mon, 8 Jul 2024 14:36:45 +0400 Subject: [PATCH 11/57] self-review changes Signed-off-by: Oleg Vasilev --- pkg/agent/core/state.go | 2 +- pkg/agent/runner.go | 2 -- pkg/plugin/run.go | 7 +------ 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/pkg/agent/core/state.go b/pkg/agent/core/state.go index ad32d093b..6ba2bcbfd 100644 --- a/pkg/agent/core/state.go +++ b/pkg/agent/core/state.go @@ -463,7 +463,7 @@ func (s *state) calculateNeonVMAction( desiredTime := logicalTime if desiredResources.HasFieldLessThan(s.VM.Using()) { - // We are downscaling, so we needed a permit from monitor + // We are downscaling, so we needed a permit from the¡ monitor desiredTime = desiredTime.Earliest(s.Monitor.CurrentLogicalTime) } diff --git a/pkg/agent/runner.go b/pkg/agent/runner.go index daa5dad8a..2b4fe1a47 100644 --- a/pkg/agent/runner.go +++ b/pkg/agent/runner.go @@ -659,8 +659,6 @@ func (r *Runner) doNeonVMRequest( Value: desiredLogicalTime, }} - fmt.Printf("Desired Logical Time: %v\n", desiredLogicalTime) - patchPayload, err := json.Marshal(patches) if err != nil { panic(fmt.Errorf("Error marshalling JSON patch: %w", err)) diff --git a/pkg/plugin/run.go b/pkg/plugin/run.go index 36aa47d15..b49da078c 100644 --- a/pkg/plugin/run.go +++ b/pkg/plugin/run.go @@ -309,12 +309,7 @@ func (e *AutoscaleEnforcer) handleResources( }), ) - result := api.Resources{ - VCPU: pod.cpu.Reserved, - Mem: pod.mem.Reserved, - } - - return result, 200, nil + return api.Resources{VCPU: pod.cpu.Reserved, Mem: pod.mem.Reserved}, 200, nil } func (e *AutoscaleEnforcer) updateMetricsAndCheckMustMigrate( From ca9e6c49908338ba4175067a3e62f352cef39b16 Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Mon, 8 Jul 2024 17:13:46 +0400 Subject: [PATCH 12/57] couple of renames Signed-off-by: Oleg Vasilev --- pkg/agent/core/state.go | 12 ++++++------ pkg/agent/core/state_test.go | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pkg/agent/core/state.go b/pkg/agent/core/state.go index 6ba2bcbfd..e6e9f9f77 100644 --- a/pkg/agent/core/state.go +++ b/pkg/agent/core/state.go @@ -838,15 +838,15 @@ func (s *state) desiredResourcesFromMetricsOrRequestedUpscaling(now time.Time) ( return nil } } - s.updateDesiredClock(now, result, s.VM.Using(), requestedUpscalingAffectedResult) - s.updateCurrentClock(s.VM.CurrentLogicalTime) + s.updateDesiredLogicalTime(now, result, s.VM.Using(), requestedUpscalingAffectedResult) + s.updateCurrentLogicalTime(s.VM.CurrentLogicalTime) s.info("Calculated desired resources", zap.Object("current", s.VM.Using()), zap.Object("target", result)) return result, calculateWaitTime } -func (s *state) updateDesiredClock( +func (s *state) updateDesiredLogicalTime( now time.Time, desired api.Resources, current api.Resources, @@ -866,7 +866,7 @@ func (s *state) updateDesiredClock( s.DesiredLogicalTime = s.ClockSource.Next(now, flags) } -func (s *state) updateCurrentClock(logicalTime *vmv1.LogicalTime) { +func (s *state) 
updateCurrentLogicalTime(logicalTime *vmv1.LogicalTime) { err := s.ClockSource.Observe(logicalTime) if err != nil { s.warnf("Failed to observe clock source: %v", err) @@ -1178,8 +1178,8 @@ func (s *State) NeonVM() NeonVMHandle { return NeonVMHandle{&s.internal} } -func (s *State) UpdateCurrentClock(logicalTime *vmv1.LogicalTime) { - s.internal.updateCurrentClock(logicalTime) +func (s *State) UpdateCurrentLogicalTime(logicalTime *vmv1.LogicalTime) { + s.internal.updateCurrentLogicalTime(logicalTime) } func (h NeonVMHandle) StartingRequest(now time.Time, resources api.Resources) { diff --git a/pkg/agent/core/state_test.go b/pkg/agent/core/state_test.go index 3ee52345a..0cde6ea4c 100644 --- a/pkg/agent/core/state_test.go +++ b/pkg/agent/core/state_test.go @@ -327,7 +327,7 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { Wait: &core.ActionWait{Duration: duration("4.8s")}, }) a.Do(state.NeonVM().RequestSuccessful, clock.Now()) - state.UpdateCurrentClock(lt) + state.UpdateCurrentLogicalTime(lt) lt = logicalTime(clock, 6) From 5561734e0555afcb0a6a1edb3f84cb08e29f4eab Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Tue, 9 Jul 2024 14:45:01 +0400 Subject: [PATCH 13/57] add comments Signed-off-by: Oleg Vasilev --- pkg/agent/core/logiclock/logiclock.go | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/pkg/agent/core/logiclock/logiclock.go b/pkg/agent/core/logiclock/logiclock.go index 5eb9e5276..02b5a82f4 100644 --- a/pkg/agent/core/logiclock/logiclock.go +++ b/pkg/agent/core/logiclock/logiclock.go @@ -9,6 +9,7 @@ import ( vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1" ) +// Flag is a set of flags that can be associated with a logical timestamp. type Flag uint64 const ( @@ -29,17 +30,24 @@ func (f Flag) Has(flag Flag) bool { return f&flag != 0 } -type measurement struct { - createdAt time.Time - flags Flag -} - +// Clock can generate and observe logical time. +// Each logical timestamp is associated with a physical timestamp and a set of flags upon creation. +// Once Clock observes a previously generated timestamp after some time, it will call the callback with +// the time difference and the flags associated with the timestamp. type Clock struct { - cb func(time.Duration, Flag) + cb func(time.Duration, Flag) + + // The in-flight timestamps are stored in-order. + // After the timestamp is observed, it is removed from the measurements, and the offset is increased. measurements []measurement offset int64 } +type measurement struct { + createdAt time.Time + flags Flag +} + func NewClock(cb func(time.Duration, Flag)) *Clock { return &Clock{ cb: cb, @@ -48,13 +56,13 @@ func NewClock(cb func(time.Duration, Flag)) *Clock { } } -func (c *Clock) NextValue() int64 { +func (c *Clock) nextValue() int64 { return c.offset + int64(len(c.measurements)) } func (c *Clock) Next(now time.Time, flags Flag) *vmv1.LogicalTime { ret := vmv1.LogicalTime{ - Value: c.NextValue(), + Value: c.nextValue(), UpdatedAt: v1.NewTime(now), } c.measurements = append(c.measurements, measurement{ @@ -69,6 +77,7 @@ func (c *Clock) Observe(logicalTime *vmv1.LogicalTime) error { return nil } if logicalTime.Value < c.offset { + // Already observed return nil } @@ -83,6 +92,7 @@ func (c *Clock) Observe(logicalTime *vmv1.LogicalTime) error { c.cb(diff, c.measurements[idx].flags) } + // Forget the measurement, and all the measurements before it. 
c.offset = logicalTime.Value + 1 c.measurements = c.measurements[idx+1:] From 63605e124016ba7a7564f1570e03c0eddfd944df Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Tue, 9 Jul 2024 15:18:42 +0400 Subject: [PATCH 14/57] don't exclude Action from exhaustruct Signed-off-by: Oleg Vasilev --- .golangci.yml | 1 - pkg/agent/core/state_test.go | 221 +++++++++++++++++++++-------------- 2 files changed, 132 insertions(+), 90 deletions(-) diff --git a/.golangci.yml b/.golangci.yml index 78ffb616d..4fa039051 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -79,7 +79,6 @@ linters-settings: - '^github\.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1\.VirtualMachine(Migration)?(Spec)?$' - '^github\.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1\.IPPool$' - '^github\.com/neondatabase/autoscaling/pkg/agent/core\.ActionSet$' - - '^github\.com/neondatabase/autoscaling/pkg/agent/core\.Action.*$' - '^github\.com/neondatabase/autoscaling/pkg/util/patch\.Operation$' - '^github\.com/neondatabase/autoscaling/pkg/util/watch\.HandlerFuncs$' diff --git a/pkg/agent/core/state_test.go b/pkg/agent/core/state_test.go index 0cde6ea4c..1951fe75b 100644 --- a/pkg/agent/core/state_test.go +++ b/pkg/agent/core/state_test.go @@ -477,9 +477,10 @@ func TestPeriodicPluginRequest(t *testing.T) { } else { a.Call(state.NextActions, clock.Now()).Equals(core.ActionSet{ PluginRequest: &core.ActionPluginRequest{ - LastPermit: &resources, - Target: resources, - Metrics: lo.ToPtr(metrics.ToAPI()), + LastPermit: &resources, + Target: resources, + Metrics: lo.ToPtr(metrics.ToAPI()), + DesiredLogicalTime: nil, }, }) a.Do(state.Plugin().StartingRequest, clock.Now(), resources) @@ -561,8 +562,9 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: duration("6.8s")}, MonitorDownscale: &core.ActionMonitorDownscale{ - Current: resForCU(6), - Target: resForCU(5), + Current: resForCU(6), + Target: resForCU(5), + DesiredLogicalTime: nil, }, }) a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(5)) @@ -583,16 +585,18 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { var expectedNeonVMRequest *core.ActionNeonVMRequest if cu < 5 { expectedNeonVMRequest = &core.ActionNeonVMRequest{ - Current: resForCU(6), - Target: resForCU(cu + 1), + Current: resForCU(6), + Target: resForCU(cu + 1), + DesiredLogicalTime: nil, } } a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: currentPluginWait}, MonitorDownscale: &core.ActionMonitorDownscale{ - Current: resForCU(cu + 1), - Target: resForCU(cu), + Current: resForCU(cu + 1), + Target: resForCU(cu), + DesiredLogicalTime: nil, }, NeonVMRequest: expectedNeonVMRequest, }) @@ -632,9 +636,10 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: duration("3.9s")}, PluginRequest: &core.ActionPluginRequest{ - LastPermit: lo.ToPtr(resForCU(6)), - Target: resForCU(3), - Metrics: lo.ToPtr(metrics.ToAPI()), + LastPermit: lo.ToPtr(resForCU(6)), + Target: resForCU(3), + Metrics: lo.ToPtr(metrics.ToAPI()), + DesiredLogicalTime: nil, }, }) a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(3)) @@ -658,8 +663,9 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: duration("3.1s")}, MonitorDownscale: &core.ActionMonitorDownscale{ - Current: resForCU(3), - Target: resForCU(2), + Current: resForCU(3), + Target: 
resForCU(2), + DesiredLogicalTime: nil, }, }) a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(2)) @@ -675,9 +681,10 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: duration("1s")}, // still want to retry vm-monitor downscaling PluginRequest: &core.ActionPluginRequest{ - LastPermit: lo.ToPtr(resForCU(3)), - Target: resForCU(3), - Metrics: lo.ToPtr(metrics.ToAPI()), + LastPermit: lo.ToPtr(resForCU(3)), + Target: resForCU(3), + Metrics: lo.ToPtr(metrics.ToAPI()), + DesiredLogicalTime: nil, }, }) a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(3)) @@ -701,16 +708,18 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { var expectedNeonVMRequest *core.ActionNeonVMRequest if cu < 2 { expectedNeonVMRequest = &core.ActionNeonVMRequest{ - Current: resForCU(3), - Target: resForCU(cu + 1), + Current: resForCU(3), + Target: resForCU(cu + 1), + DesiredLogicalTime: nil, } } a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: currentPluginWait}, MonitorDownscale: &core.ActionMonitorDownscale{ - Current: resForCU(cu + 1), - Target: resForCU(cu), + Current: resForCU(cu + 1), + Target: resForCU(cu), + DesiredLogicalTime: nil, }, NeonVMRequest: expectedNeonVMRequest, }) @@ -728,8 +737,9 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: duration("5.8s")}, NeonVMRequest: &core.ActionNeonVMRequest{ - Current: resForCU(3), - Target: resForCU(1), + Current: resForCU(3), + Target: resForCU(1), + DesiredLogicalTime: nil, }, }) a.Do(state.NeonVM().StartingRequest, time.Now(), resForCU(1)) @@ -741,9 +751,10 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { // Successfully downscaled, so now we should inform the plugin. Not waiting on any retries. a.Call(nextActions).Equals(core.ActionSet{ PluginRequest: &core.ActionPluginRequest{ - LastPermit: lo.ToPtr(resForCU(3)), - Target: resForCU(1), - Metrics: lo.ToPtr(metrics.ToAPI()), + LastPermit: lo.ToPtr(resForCU(3)), + Target: resForCU(1), + Metrics: lo.ToPtr(metrics.ToAPI()), + DesiredLogicalTime: nil, }, }) a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(1)) @@ -806,9 +817,10 @@ func TestRequestedUpscale(t *testing.T) { a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: duration("6s")}, // if nothing else happens, requested upscale expires. 
PluginRequest: &core.ActionPluginRequest{ - LastPermit: lo.ToPtr(resForCU(1)), - Target: resForCU(2), - Metrics: lo.ToPtr(lastMetrics.ToAPI()), + LastPermit: lo.ToPtr(resForCU(1)), + Target: resForCU(2), + Metrics: lo.ToPtr(lastMetrics.ToAPI()), + DesiredLogicalTime: nil, }, }) a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(2)) @@ -825,8 +837,9 @@ func TestRequestedUpscale(t *testing.T) { a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: duration("4.9s")}, // plugin tick wait is earlier than requested upscale expiration NeonVMRequest: &core.ActionNeonVMRequest{ - Current: resForCU(1), - Target: resForCU(2), + Current: resForCU(1), + Target: resForCU(2), + DesiredLogicalTime: nil, }, }) a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(2)) @@ -837,8 +850,9 @@ func TestRequestedUpscale(t *testing.T) { a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: duration("4.8s")}, // still waiting on plugin tick MonitorUpscale: &core.ActionMonitorUpscale{ - Current: resForCU(1), - Target: resForCU(2), + Current: resForCU(1), + Target: resForCU(2), + DesiredLogicalTime: nil, }, }) a.Do(state.Monitor().StartingUpscaleRequest, clock.Now(), resForCU(2)) @@ -857,9 +871,10 @@ func TestRequestedUpscale(t *testing.T) { a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: duration("1s")}, PluginRequest: &core.ActionPluginRequest{ - LastPermit: lo.ToPtr(resForCU(2)), - Target: resForCU(2), - Metrics: lo.ToPtr(lastMetrics.ToAPI()), + LastPermit: lo.ToPtr(resForCU(2)), + Target: resForCU(2), + Metrics: lo.ToPtr(lastMetrics.ToAPI()), + DesiredLogicalTime: nil, }, }) a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(2)) @@ -880,8 +895,9 @@ func TestRequestedUpscale(t *testing.T) { a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: duration("4s")}, // now, waiting on plugin request tick MonitorDownscale: &core.ActionMonitorDownscale{ - Current: resForCU(2), - Target: resForCU(1), + Current: resForCU(2), + Target: resForCU(1), + DesiredLogicalTime: nil, }, }) } @@ -931,6 +947,8 @@ func TestDownscalePivotBack(t *testing.T) { MonitorDownscale: &core.ActionMonitorDownscale{ Current: resForCU(2), Target: resForCU(1), + + DesiredLogicalTime: nil, }, }) a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(1)) @@ -946,8 +964,9 @@ func TestDownscalePivotBack(t *testing.T) { a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: *pluginWait}, MonitorUpscale: &core.ActionMonitorUpscale{ - Current: resForCU(1), - Target: resForCU(2), + Current: resForCU(1), + Target: resForCU(2), + DesiredLogicalTime: nil, }, }) a.Do(state.Monitor().StartingUpscaleRequest, clock.Now(), resForCU(2)) @@ -964,8 +983,9 @@ func TestDownscalePivotBack(t *testing.T) { a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: *pluginWait}, NeonVMRequest: &core.ActionNeonVMRequest{ - Current: resForCU(2), - Target: resForCU(1), + Current: resForCU(2), + Target: resForCU(1), + DesiredLogicalTime: nil, }, }) a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(1)) @@ -981,8 +1001,9 @@ func TestDownscalePivotBack(t *testing.T) { a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: *pluginWait}, NeonVMRequest: &core.ActionNeonVMRequest{ - Current: resForCU(1), - Target: resForCU(2), + Current: resForCU(1), + Target: resForCU(2), + DesiredLogicalTime: nil, }, }) a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(2)) @@ -998,9 +1019,10 @@ func 
TestDownscalePivotBack(t *testing.T) { t.Log(" > start plugin downscale") a.Call(nextActions).Equals(core.ActionSet{ PluginRequest: &core.ActionPluginRequest{ - LastPermit: lo.ToPtr(resForCU(2)), - Target: resForCU(1), - Metrics: lo.ToPtr(initialMetrics.ToAPI()), + LastPermit: lo.ToPtr(resForCU(2)), + Target: resForCU(1), + Metrics: lo.ToPtr(initialMetrics.ToAPI()), + DesiredLogicalTime: nil, }, }) a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(1)) @@ -1018,9 +1040,10 @@ func TestDownscalePivotBack(t *testing.T) { t.Log(" > start plugin upscale") a.Call(nextActions).Equals(core.ActionSet{ PluginRequest: &core.ActionPluginRequest{ - LastPermit: lo.ToPtr(resForCU(1)), - Target: resForCU(2), - Metrics: lo.ToPtr(newMetrics.ToAPI()), + LastPermit: lo.ToPtr(resForCU(1)), + Target: resForCU(2), + Metrics: lo.ToPtr(newMetrics.ToAPI()), + DesiredLogicalTime: nil, }, }) a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(2)) @@ -1138,6 +1161,8 @@ func TestBoundsChangeRequiresDownsale(t *testing.T) { MonitorDownscale: &core.ActionMonitorDownscale{ Current: resForCU(2), Target: resForCU(1), + + DesiredLogicalTime: nil, }, }) a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(1)) @@ -1149,6 +1174,8 @@ func TestBoundsChangeRequiresDownsale(t *testing.T) { NeonVMRequest: &core.ActionNeonVMRequest{ Current: resForCU(2), Target: resForCU(1), + + DesiredLogicalTime: nil, }, }) a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(1)) @@ -1157,9 +1184,10 @@ func TestBoundsChangeRequiresDownsale(t *testing.T) { // Do plugin request for that downscaling: a.Call(nextActions).Equals(core.ActionSet{ PluginRequest: &core.ActionPluginRequest{ - LastPermit: lo.ToPtr(resForCU(2)), - Target: resForCU(1), - Metrics: lo.ToPtr(metrics.ToAPI()), + LastPermit: lo.ToPtr(resForCU(2)), + Target: resForCU(1), + Metrics: lo.ToPtr(metrics.ToAPI()), + DesiredLogicalTime: nil, }, }) a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(1)) @@ -1227,9 +1255,10 @@ func TestBoundsChangeRequiresUpscale(t *testing.T) { // We should be making a plugin request to get upscaling: a.Call(nextActions).Equals(core.ActionSet{ PluginRequest: &core.ActionPluginRequest{ - LastPermit: lo.ToPtr(resForCU(2)), - Target: resForCU(3), - Metrics: lo.ToPtr(metrics.ToAPI()), + LastPermit: lo.ToPtr(resForCU(2)), + Target: resForCU(3), + Metrics: lo.ToPtr(metrics.ToAPI()), + DesiredLogicalTime: nil, }, }) a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(3)) @@ -1242,8 +1271,9 @@ func TestBoundsChangeRequiresUpscale(t *testing.T) { a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: duration("4.9s")}, NeonVMRequest: &core.ActionNeonVMRequest{ - Current: resForCU(2), - Target: resForCU(3), + Current: resForCU(2), + Target: resForCU(3), + DesiredLogicalTime: nil, }, }) a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(3)) @@ -1253,8 +1283,9 @@ func TestBoundsChangeRequiresUpscale(t *testing.T) { a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: duration("4.8s")}, MonitorUpscale: &core.ActionMonitorUpscale{ - Current: resForCU(2), - Target: resForCU(3), + Current: resForCU(2), + Target: resForCU(3), + DesiredLogicalTime: nil, }, }) a.Do(state.Monitor().StartingUpscaleRequest, clock.Now(), resForCU(3)) @@ -1306,9 +1337,10 @@ func TestFailedRequestRetry(t *testing.T) { // We should be asking the scheduler for upscaling a.Call(nextActions).Equals(core.ActionSet{ PluginRequest: &core.ActionPluginRequest{ - LastPermit: lo.ToPtr(resForCU(1)), - Target: 
resForCU(2), - Metrics: lo.ToPtr(metrics.ToAPI()), + LastPermit: lo.ToPtr(resForCU(1)), + Target: resForCU(2), + Metrics: lo.ToPtr(metrics.ToAPI()), + DesiredLogicalTime: nil, }, }) a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(2)) @@ -1325,9 +1357,10 @@ func TestFailedRequestRetry(t *testing.T) { // ... and then retry: a.Call(nextActions).Equals(core.ActionSet{ PluginRequest: &core.ActionPluginRequest{ - LastPermit: lo.ToPtr(resForCU(1)), - Target: resForCU(2), - Metrics: lo.ToPtr(metrics.ToAPI()), + LastPermit: lo.ToPtr(resForCU(1)), + Target: resForCU(2), + Metrics: lo.ToPtr(metrics.ToAPI()), + DesiredLogicalTime: nil, }, }) a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(2)) @@ -1342,8 +1375,9 @@ func TestFailedRequestRetry(t *testing.T) { a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: duration("4.9s")}, // plugin request tick NeonVMRequest: &core.ActionNeonVMRequest{ - Current: resForCU(1), - Target: resForCU(2), + Current: resForCU(1), + Target: resForCU(2), + DesiredLogicalTime: nil, }, }) a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(2)) @@ -1360,8 +1394,9 @@ func TestFailedRequestRetry(t *testing.T) { a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: duration("1.8s")}, // plugin request tick NeonVMRequest: &core.ActionNeonVMRequest{ - Current: resForCU(1), - Target: resForCU(2), + Current: resForCU(1), + Target: resForCU(2), + DesiredLogicalTime: nil, }, }) a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(2)) @@ -1372,8 +1407,9 @@ func TestFailedRequestRetry(t *testing.T) { a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: duration("1.7s")}, // plugin request tick MonitorUpscale: &core.ActionMonitorUpscale{ - Current: resForCU(1), - Target: resForCU(2), + Current: resForCU(1), + Target: resForCU(2), + DesiredLogicalTime: nil, }, }) } @@ -1409,9 +1445,10 @@ func TestMetricsConcurrentUpdatedDuringDownscale(t *testing.T) { Call(state.NextActions, clock.Now()). Equals(core.ActionSet{ PluginRequest: &core.ActionPluginRequest{ - LastPermit: nil, - Target: resForCU(3), - Metrics: nil, + LastPermit: nil, + Target: resForCU(3), + Metrics: nil, + DesiredLogicalTime: nil, }, }) a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(3)) @@ -1429,8 +1466,9 @@ func TestMetricsConcurrentUpdatedDuringDownscale(t *testing.T) { a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: duration("4.8s")}, MonitorDownscale: &core.ActionMonitorDownscale{ - Current: resForCU(3), - Target: resForCU(2), + Current: resForCU(3), + Target: resForCU(2), + DesiredLogicalTime: nil, }, }) a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(2)) @@ -1459,12 +1497,14 @@ func TestMetricsConcurrentUpdatedDuringDownscale(t *testing.T) { a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: duration("4.6s")}, // plugin request tick wait NeonVMRequest: &core.ActionNeonVMRequest{ - Current: resForCU(3), - Target: resForCU(2), + Current: resForCU(3), + Target: resForCU(2), + DesiredLogicalTime: nil, }, MonitorDownscale: &core.ActionMonitorDownscale{ - Current: resForCU(2), - Target: resForCU(1), + Current: resForCU(2), + Target: resForCU(1), + DesiredLogicalTime: nil, }, }) // Start both requests. The vm-monitor request will finish first, but after that we'll just be @@ -1495,13 +1535,15 @@ func TestMetricsConcurrentUpdatedDuringDownscale(t *testing.T) { // incorrectly for 1 CU, rather than 2 CU. 
So, the rest of this test case is mostly just // rounding out the rest of the scale-down routine. PluginRequest: &core.ActionPluginRequest{ - LastPermit: lo.ToPtr(resForCU(3)), - Target: resForCU(2), - Metrics: lo.ToPtr(metrics.ToAPI()), + LastPermit: lo.ToPtr(resForCU(3)), + Target: resForCU(2), + Metrics: lo.ToPtr(metrics.ToAPI()), + DesiredLogicalTime: nil, }, NeonVMRequest: &core.ActionNeonVMRequest{ - Current: resForCU(2), - Target: resForCU(1), + Current: resForCU(2), + Target: resForCU(1), + DesiredLogicalTime: nil, }, }) a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(2)) @@ -1525,9 +1567,10 @@ func TestMetricsConcurrentUpdatedDuringDownscale(t *testing.T) { a.Do(state.NeonVM().RequestSuccessful, clock.Now()) a.Call(nextActions).Equals(core.ActionSet{ PluginRequest: &core.ActionPluginRequest{ - LastPermit: lo.ToPtr(resForCU(2)), - Target: resForCU(1), - Metrics: lo.ToPtr(metrics.ToAPI()), + LastPermit: lo.ToPtr(resForCU(2)), + Target: resForCU(1), + Metrics: lo.ToPtr(metrics.ToAPI()), + DesiredLogicalTime: nil, }, }) a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(1)) From b43dd404bb1d6f65f9e71b64aefadb9e97e79bb3 Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Tue, 9 Jul 2024 16:09:11 +0400 Subject: [PATCH 15/57] move DesiredLogicalTime out of Guest Signed-off-by: Oleg Vasilev --- neonvm/apis/neonvm/v1/virtualmachine_types.go | 8 +++--- .../apis/neonvm/v1/zz_generated.deepcopy.go | 10 +++---- .../bases/vm.neon.tech_virtualmachines.yaml | 28 +++++++++---------- neonvm/controllers/vm_controller.go | 2 +- pkg/agent/runner.go | 2 +- 5 files changed, 25 insertions(+), 25 deletions(-) diff --git a/neonvm/apis/neonvm/v1/virtualmachine_types.go b/neonvm/apis/neonvm/v1/virtualmachine_types.go index 8c744c82f..a1c803c54 100644 --- a/neonvm/apis/neonvm/v1/virtualmachine_types.go +++ b/neonvm/apis/neonvm/v1/virtualmachine_types.go @@ -140,6 +140,10 @@ type VirtualMachineSpec struct { // +kubebuilder:default:=true // +optional EnableSSH *bool `json:"enableSSH,omitempty"` + + // Logical timestamp corresponding to the desired resources of the VM. + // +optional + DesiredLogicalTime *LogicalTime `json:"desiredLogicalTime,omitempty"` } func (spec *VirtualMachineSpec) Resources() VirtualMachineResources { @@ -192,10 +196,6 @@ type Guest struct { // +optional Ports []Port `json:"ports,omitempty"` - // Logical clock value corresponding to the desired resources of the VM. - // +optional - DesiredLogicalTime *LogicalTime `json:"desiredLogicalTime,omitempty"` - // Additional settings for the VM. // Cannot be updated. 
// +optional diff --git a/neonvm/apis/neonvm/v1/zz_generated.deepcopy.go b/neonvm/apis/neonvm/v1/zz_generated.deepcopy.go index a41843f10..e78d5b5d0 100644 --- a/neonvm/apis/neonvm/v1/zz_generated.deepcopy.go +++ b/neonvm/apis/neonvm/v1/zz_generated.deepcopy.go @@ -186,11 +186,6 @@ func (in *Guest) DeepCopyInto(out *Guest) { *out = make([]Port, len(*in)) copy(*out, *in) } - if in.DesiredLogicalTime != nil { - in, out := &in.DesiredLogicalTime, &out.DesiredLogicalTime - *out = new(LogicalTime) - (*in).DeepCopyInto(*out) - } if in.Settings != nil { in, out := &in.Settings, &out.Settings *out = new(GuestSettings) @@ -744,6 +739,11 @@ func (in *VirtualMachineSpec) DeepCopyInto(out *VirtualMachineSpec) { *out = new(bool) **out = **in } + if in.DesiredLogicalTime != nil { + in, out := &in.DesiredLogicalTime, &out.DesiredLogicalTime + *out = new(LogicalTime) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VirtualMachineSpec. diff --git a/neonvm/config/crd/bases/vm.neon.tech_virtualmachines.yaml b/neonvm/config/crd/bases/vm.neon.tech_virtualmachines.yaml index 45d864501..0666407e2 100644 --- a/neonvm/config/crd/bases/vm.neon.tech_virtualmachines.yaml +++ b/neonvm/config/crd/bases/vm.neon.tech_virtualmachines.yaml @@ -892,6 +892,20 @@ spec: type: array type: object type: object + desiredLogicalTime: + description: Logical timestamp corresponding to the desired resources + of the VM. + properties: + updatedAt: + format: date-time + type: string + value: + format: int64 + type: integer + required: + - updatedAt + - value + type: object disks: description: List of disk that can be mounted by virtual machine. items: @@ -2397,20 +2411,6 @@ spec: - min - use type: object - desiredLogicalTime: - description: Logical clock value corresponding to the desired - resources of the VM. - properties: - updatedAt: - format: date-time - type: string - value: - format: int64 - type: integer - required: - - updatedAt - - value - type: object env: description: List of environment variables to set in the vmstart process. 
diff --git a/neonvm/controllers/vm_controller.go b/neonvm/controllers/vm_controller.go index 573f1957e..5c1a356e0 100644 --- a/neonvm/controllers/vm_controller.go +++ b/neonvm/controllers/vm_controller.go @@ -801,7 +801,7 @@ func (r *VMReconciler) doReconcile(ctx context.Context, vm *vmv1.VirtualMachine) } if vm.Status.Phase == vmv1.VmRunning { - vm.Status.CurrentLogicalTime = vm.Spec.Guest.DesiredLogicalTime.RewindNow() + vm.Status.CurrentLogicalTime = vm.Spec.DesiredLogicalTime.RewindNow() } return nil diff --git a/pkg/agent/runner.go b/pkg/agent/runner.go index 2b4fe1a47..aaad51894 100644 --- a/pkg/agent/runner.go +++ b/pkg/agent/runner.go @@ -655,7 +655,7 @@ func (r *Runner) doNeonVMRequest( Value: uint32(target.Mem / r.memSlotSize), }, { Op: patch.OpReplace, - Path: "/spec/guest/desiredLogicalTime", + Path: "/spec/desiredLogicalTime", Value: desiredLogicalTime, }} From eb1e6b131d40e5446764f26506778b01321a7422 Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Tue, 9 Jul 2024 17:43:26 +0400 Subject: [PATCH 16/57] small changes Signed-off-by: Oleg Vasilev --- neonvm/apis/neonvm/v1/virtualmachine_types.go | 8 ++++++++ pkg/agent/core/state.go | 7 +++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/neonvm/apis/neonvm/v1/virtualmachine_types.go b/neonvm/apis/neonvm/v1/virtualmachine_types.go index a1c803c54..911e0f995 100644 --- a/neonvm/apis/neonvm/v1/virtualmachine_types.go +++ b/neonvm/apis/neonvm/v1/virtualmachine_types.go @@ -24,6 +24,7 @@ import ( "time" "github.com/samber/lo" + "go.uber.org/zap/zapcore" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" @@ -226,6 +227,13 @@ type LogicalTime struct { UpdatedAt metav1.Time `json:"updatedAt"` } +// MarshalLogObject implements zapcore.ObjectMarshaler, so that LogicalTime can be used with zap.Object +func (r *LogicalTime) MarshalLogObject(enc zapcore.ObjectEncoder) error { + enc.AddInt64("value", r.Value) + enc.AddTime("updatedAt", r.UpdatedAt.Time) + return nil +} + func (t *LogicalTime) Rewind(now time.Time) *LogicalTime { if t == nil { return nil diff --git a/pkg/agent/core/state.go b/pkg/agent/core/state.go index e6e9f9f77..54c63b8f8 100644 --- a/pkg/agent/core/state.go +++ b/pkg/agent/core/state.go @@ -839,9 +839,11 @@ func (s *state) desiredResourcesFromMetricsOrRequestedUpscaling(now time.Time) ( } } s.updateDesiredLogicalTime(now, result, s.VM.Using(), requestedUpscalingAffectedResult) - s.updateCurrentLogicalTime(s.VM.CurrentLogicalTime) - s.info("Calculated desired resources", zap.Object("current", s.VM.Using()), zap.Object("target", result)) + s.info("Calculated desired resources", + zap.Object("current", s.VM.Using()), + zap.Object("target", result), + zap.Object("desiredLogicalTime", s.DesiredLogicalTime)) return result, calculateWaitTime } @@ -1008,6 +1010,7 @@ func (s *State) UpdatedVM(vm api.VmInfo) { // - https://github.com/neondatabase/autoscaling/issues/462 vm.SetUsing(s.internal.VM.Using()) s.internal.VM = vm + s.internal.updateCurrentLogicalTime(vm.CurrentLogicalTime) } func (s *State) UpdateSystemMetrics(metrics SystemMetrics) { From 3118e51807e3af62b08bd47b24c02657cd66316c Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Thu, 11 Jul 2024 13:21:48 +0400 Subject: [PATCH 17/57] move labels calculation into logiclock pkg Signed-off-by: Oleg Vasilev --- pkg/agent/core/logiclock/logiclock.go | 17 +++++++++++++++++ pkg/agent/core/state.go | 5 +++++ pkg/agent/prommetrics.go | 3 ++- pkg/agent/runner.go | 16 ++++------------ 4 files changed, 28 insertions(+), 13 deletions(-) diff --git 
a/pkg/agent/core/logiclock/logiclock.go b/pkg/agent/core/logiclock/logiclock.go index 02b5a82f4..935691c6f 100644 --- a/pkg/agent/core/logiclock/logiclock.go +++ b/pkg/agent/core/logiclock/logiclock.go @@ -30,6 +30,23 @@ func (f Flag) Has(flag Flag) bool { return f&flag != 0 } +// AllFlags and AllFlagNames must have the same order, so the metrics work correctly. +var AllFlags = []Flag{Upscale, Downscale, Immediate} +var AllFlagNames = []string{"upscale", "downscale", "immediate"} + +// FlagsToLabels converts a set of flags to a list of strings which prometheus can take. +func FlagsToLabels(flags Flag) []string { + var ret []string + for _, flag := range AllFlags { + value := "false" + if flags.Has(flag) { + value = "true" + } + ret = append(ret, value) + } + return ret +} + // Clock can generate and observe logical time. // Each logical timestamp is associated with a physical timestamp and a set of flags upon creation. // Once Clock observes a previously generated timestamp after some time, it will call the callback with diff --git a/pkg/agent/core/state.go b/pkg/agent/core/state.go index 54c63b8f8..225812490 100644 --- a/pkg/agent/core/state.go +++ b/pkg/agent/core/state.go @@ -865,6 +865,11 @@ func (s *state) updateDesiredLogicalTime( flags.Set(logiclock.Immediate) } + if flags == 0 { + // Nothing changed, so no need to update the logical time + return + } + s.DesiredLogicalTime = s.ClockSource.Next(now, flags) } diff --git a/pkg/agent/prommetrics.go b/pkg/agent/prommetrics.go index dcbaae709..41ffcb6aa 100644 --- a/pkg/agent/prommetrics.go +++ b/pkg/agent/prommetrics.go @@ -4,6 +4,7 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/collectors" + "github.com/neondatabase/autoscaling/pkg/agent/core/logiclock" "github.com/neondatabase/autoscaling/pkg/util" ) @@ -224,7 +225,7 @@ func makeGlobalMetrics() (GlobalMetrics, *prometheus.Registry) { prometheus.HistogramOpts{ Name: "autoscaling_agent_scaling_latency_seconds", Help: "End-to-end scaling latency", - }, []string{"upscale", "downscale", "immediate"}, + }, logiclock.AllFlagNames, )), } diff --git a/pkg/agent/runner.go b/pkg/agent/runner.go index aaad51894..5f889a186 100644 --- a/pkg/agent/runner.go +++ b/pkg/agent/runner.go @@ -196,18 +196,10 @@ func (r *Runner) Run(ctx context.Context, logger *zap.Logger, vmInfoUpdated util pluginRequestJitter := util.NewTimeRange(time.Millisecond, 0, 100).Random() coreExecLogger := execLogger.Named("core") - clock := logiclock.NewClock(func(duration time.Duration, flag logiclock.Flag) { - labels := []string{"false", "false", "false"} - if flag.Has(logiclock.Upscale) { - labels[0] = "true" - } - if flag.Has(logiclock.Downscale) { - labels[1] = "true" - } - if flag.Has(logiclock.Immediate) { - labels[2] = "true" - } - r.global.metrics.scalingLatency.WithLabelValues(labels...).Observe(duration.Seconds()) + clock := logiclock.NewClock(func(duration time.Duration, flags logiclock.Flag) { + r.global.metrics.scalingLatency. + WithLabelValues(logiclock.FlagsToLabels(flags)...). 
+ Observe(duration.Seconds()) }) executorCore := executor.NewExecutorCore(coreExecLogger, getVmInfo(), executor.Config{ OnNextActions: r.global.metrics.runnerNextActions.Inc, From 5a6b537c7bd71eb96c45d896835739043ee29fe7 Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Thu, 11 Jul 2024 13:25:15 +0400 Subject: [PATCH 18/57] cleanup extract vm info Signed-off-by: Oleg Vasilev --- pkg/api/vminfo.go | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/pkg/api/vminfo.go b/pkg/api/vminfo.go index 6bd02c3e7..3ce73a4e8 100644 --- a/pkg/api/vminfo.go +++ b/pkg/api/vminfo.go @@ -151,7 +151,13 @@ func (vm VmInfo) NamespacedName() util.NamespacedName { func ExtractVmInfo(logger *zap.Logger, vm *vmapi.VirtualMachine) (*VmInfo, error) { logger = logger.With(util.VMNameFields(vm)) - return extractVmInfoGeneric(logger, vm.Name, vm, vm.Spec.Resources(), vm.Status.CurrentLogicalTime) + info, err := extractVmInfoGeneric(logger, vm.Name, vm, vm.Spec.Resources()) + if err != nil { + return nil, fmt.Errorf("error extracting VM info: %w", err) + } + + info.CurrentLogicalTime = vm.Status.CurrentLogicalTime + return info, nil } func ExtractVmInfoFromPod(logger *zap.Logger, pod *corev1.Pod) (*VmInfo, error) { @@ -165,7 +171,7 @@ func ExtractVmInfoFromPod(logger *zap.Logger, pod *corev1.Pod) (*VmInfo, error) } vmName := pod.Labels[vmapi.VirtualMachineNameLabel] - return extractVmInfoGeneric(logger, vmName, pod, resources, nil) + return extractVmInfoGeneric(logger, vmName, pod, resources) } func extractVmInfoGeneric( @@ -173,7 +179,6 @@ func extractVmInfoGeneric( vmName string, obj metav1.ObjectMetaAccessor, resources vmapi.VirtualMachineResources, - currentClock *vmapi.LogicalTime, ) (*VmInfo, error) { cpuInfo := NewVmCpuInfo(resources.CPUs) memInfo := NewVmMemInfo(resources.MemorySlots, resources.MemorySlotSize) @@ -193,7 +198,7 @@ func extractVmInfoGeneric( ScalingEnabled: scalingEnabled, ScalingConfig: nil, // set below, maybe }, - CurrentLogicalTime: currentClock, + CurrentLogicalTime: nil, // set later, maybe } if boundsJSON, ok := obj.GetObjectMeta().GetAnnotations()[AnnotationAutoscalingBounds]; ok { From e26c155bee2efdcc93525fcc2ef1d07411facfe0 Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Thu, 11 Jul 2024 14:03:06 +0400 Subject: [PATCH 19/57] add tests for logic clock Signed-off-by: Oleg Vasilev --- pkg/agent/core/dumpstate.go | 19 +++++++------ pkg/agent/core/state.go | 37 ++++++++++++++++-------- pkg/agent/core/state_test.go | 55 ++++++++++++++++++++++++------------ 3 files changed, 73 insertions(+), 38 deletions(-) diff --git a/pkg/agent/core/dumpstate.go b/pkg/agent/core/dumpstate.go index 8a3bc8ad4..fdd74b2e6 100644 --- a/pkg/agent/core/dumpstate.go +++ b/pkg/agent/core/dumpstate.go @@ -34,15 +34,16 @@ func (d StateDump) MarshalJSON() ([]byte, error) { func (s *State) Dump() StateDump { return StateDump{ internal: state{ - Debug: s.internal.Debug, - Config: s.internal.Config, - VM: s.internal.VM, - Plugin: s.internal.Plugin.deepCopy(), - Monitor: s.internal.Monitor.deepCopy(), - NeonVM: s.internal.NeonVM.deepCopy(), - Metrics: shallowCopy[SystemMetrics](s.internal.Metrics), - ClockSource: s.internal.ClockSource, - DesiredLogicalTime: s.internal.DesiredLogicalTime, + Debug: s.internal.Debug, + Config: s.internal.Config, + VM: s.internal.VM, + Plugin: s.internal.Plugin.deepCopy(), + Monitor: s.internal.Monitor.deepCopy(), + NeonVM: s.internal.NeonVM.deepCopy(), + Metrics: shallowCopy[SystemMetrics](s.internal.Metrics), + ClockSource: s.internal.ClockSource, + 
DesiredLogicalTime: s.internal.DesiredLogicalTime, + LastDesiredResources: s.internal.LastDesiredResources, }, } } diff --git a/pkg/agent/core/state.go b/pkg/agent/core/state.go index 225812490..6a2828760 100644 --- a/pkg/agent/core/state.go +++ b/pkg/agent/core/state.go @@ -117,8 +117,13 @@ type state struct { Metrics *SystemMetrics - ClockSource LogicClock `json:"-"` + ClockSource LogicClock `json:"-"` + + // DesiredLogicalTime is the logical time autoscaler-agent currently works to achieve. DesiredLogicalTime *vmv1.LogicalTime + + // LastDesiredResources is the last target agent wanted to scale to. + LastDesiredResources *api.Resources } type pluginState struct { @@ -237,9 +242,10 @@ func NewState(vm api.VmInfo, config Config, clockSource LogicClock) *State { OngoingRequested: nil, RequestFailedAt: nil, }, - Metrics: nil, - ClockSource: clockSource, - DesiredLogicalTime: nil, + Metrics: nil, + ClockSource: clockSource, + DesiredLogicalTime: nil, + LastDesiredResources: nil, }, } @@ -840,6 +846,8 @@ func (s *state) desiredResourcesFromMetricsOrRequestedUpscaling(now time.Time) ( } s.updateDesiredLogicalTime(now, result, s.VM.Using(), requestedUpscalingAffectedResult) + s.LastDesiredResources = &result + s.info("Calculated desired resources", zap.Object("current", s.VM.Using()), zap.Object("target", result), @@ -865,15 +873,22 @@ func (s *state) updateDesiredLogicalTime( flags.Set(logiclock.Immediate) } - if flags == 0 { - // Nothing changed, so no need to update the logical time - return + if s.LastDesiredResources == nil { + if desired == current { + // First iteration, but no scaling required + return + } + } else { + if *s.LastDesiredResources == desired { + // Nothing changed, so no need to update the logical time + return + } } s.DesiredLogicalTime = s.ClockSource.Next(now, flags) } -func (s *state) updateCurrentLogicalTime(logicalTime *vmv1.LogicalTime) { +func (s *state) updateLogicalTime(logicalTime *vmv1.LogicalTime) { err := s.ClockSource.Observe(logicalTime) if err != nil { s.warnf("Failed to observe clock source: %v", err) @@ -1015,7 +1030,7 @@ func (s *State) UpdatedVM(vm api.VmInfo) { // - https://github.com/neondatabase/autoscaling/issues/462 vm.SetUsing(s.internal.VM.Using()) s.internal.VM = vm - s.internal.updateCurrentLogicalTime(vm.CurrentLogicalTime) + s.internal.updateLogicalTime(vm.CurrentLogicalTime) } func (s *State) UpdateSystemMetrics(metrics SystemMetrics) { @@ -1186,8 +1201,8 @@ func (s *State) NeonVM() NeonVMHandle { return NeonVMHandle{&s.internal} } -func (s *State) UpdateCurrentLogicalTime(logicalTime *vmv1.LogicalTime) { - s.internal.updateCurrentLogicalTime(logicalTime) +func (s *State) UpdateLogicalTime(logicalTime *vmv1.LogicalTime) { + s.internal.updateLogicalTime(logicalTime) } func (h NeonVMHandle) StartingRequest(now time.Time, resources api.Resources) { diff --git a/pkg/agent/core/state_test.go b/pkg/agent/core/state_test.go index 1951fe75b..7b514e0cd 100644 --- a/pkg/agent/core/state_test.go +++ b/pkg/agent/core/state_test.go @@ -2,6 +2,7 @@ package core_test import ( "fmt" + "github.com/stretchr/testify/assert" "testing" "time" @@ -217,17 +218,13 @@ func logicalTime(clock *helpers.FakeClock, value int64) *vmv1.LogicalTime { } } -func doInitialPluginRequest(a helpers.Assert, state *core.State, clock *helpers.FakeClock, requestTime time.Duration, metrics *api.Metrics, resources api.Resources, enableLogicalClock bool) { - var lt *vmv1.LogicalTime - if enableLogicalClock { - lt = logicalTime(clock, 0) - } +func doInitialPluginRequest(a 
helpers.Assert, state *core.State, clock *helpers.FakeClock, requestTime time.Duration, metrics *api.Metrics, resources api.Resources, _ bool) { a.Call(state.NextActions, clock.Now()).Equals(core.ActionSet{ PluginRequest: &core.ActionPluginRequest{ LastPermit: nil, Target: resources, Metrics: metrics, - DesiredLogicalTime: lt, + DesiredLogicalTime: nil, }, }) a.Do(state.Plugin().StartingRequest, clock.Now(), resources) @@ -256,7 +253,17 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { } resForCU := DefaultComputeUnit.Mul - logicClock := logiclock.NewClock(nil) + var latencyObservations []struct { + latency time.Duration + flags logiclock.Flag + } + + logicClock := logiclock.NewClock(func(latency time.Duration, flags logiclock.Flag) { + latencyObservations = append(latencyObservations, struct { + latency time.Duration + flags logiclock.Flag + }{latency, flags}) + }) state := helpers.CreateInitialState( DefaultInitialStateConfig, @@ -284,10 +291,10 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { a.Call(getDesiredResources, state, clock.Now()). Equals(resForCU(2)) - lt := logicalTime(clock, 2) - // Now that the initial scheduler request is done, and we have metrics that indicate - // scale-up would be a good idea, we should be contacting the scheduler to get approval. + // scale-up would be a good idea. Logical time nil -> 0. + lt := logicalTime(clock, 0) + // We should be contacting the scheduler to get approval. a.Call(nextActions).Equals(core.ActionSet{ PluginRequest: &core.ActionPluginRequest{ LastPermit: lo.ToPtr(resForCU(1)), @@ -305,7 +312,7 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { Permit: resForCU(2), Migrate: nil, }) - state.Plugin().UpdateLogicalTime(lt) + state.Plugin().UpdateLogicalTime(lt.Rewind(clock.Now())) // Scheduler approval is done, now we should be making the request to NeonVM a.Call(nextActions).Equals(core.ActionSet{ @@ -326,10 +333,17 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: duration("4.8s")}, }) + + // Until NeonVM is successful, we won't see any observations. + assert.Empty(t, latencyObservations) + a.Do(state.NeonVM().RequestSuccessful, clock.Now()) - state.UpdateCurrentLogicalTime(lt) + state.UpdateLogicalTime(lt.Rewind(clock.Now())) - lt = logicalTime(clock, 6) + assert.Len(t, latencyObservations, 1) + // We started at 0.2s and finished at 0.4s + assert.Equal(t, duration("0.2s"), latencyObservations[0].latency) + assert.Equal(t, logiclock.Upscale, latencyObservations[0].flags) // NeonVM change is done, now we should finish by notifying the vm-monitor a.Call(nextActions).Equals(core.ActionSet{ @@ -348,7 +362,7 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { Wait: &core.ActionWait{Duration: duration("4.7s")}, }) a.Do(state.Monitor().UpscaleRequestSuccessful, clock.Now()) - state.Monitor().UpdateLogicalTime(lt) + state.Monitor().UpdateLogicalTime(lt.Rewind(clock.Now())) // And now, double-check that there's no sneaky follow-up actions before we change the // metrics @@ -370,7 +384,7 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { a.Call(getDesiredResources, state, clock.Now()). 
Equals(resForCU(1)) - lt = logicalTime(clock, 10) + lt = logicalTime(clock, 1) // First step in downscaling is getting approval from the vm-monitor: a.Call(nextActions).Equals(core.ActionSet{ @@ -388,7 +402,7 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { Wait: &core.ActionWait{Duration: duration("4.5s")}, }) a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now()) - state.Monitor().UpdateLogicalTime(lt) + state.Monitor().UpdateLogicalTime(lt.Rewind(clock.Now())) // After getting approval from the vm-monitor, we make the request to NeonVM to carry it out a.Call(nextActions).Equals(core.ActionSet{ @@ -406,8 +420,13 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { Wait: &core.ActionWait{Duration: duration("4.4s")}, }) a.Do(state.NeonVM().RequestSuccessful, clock.Now()) + state.UpdateLogicalTime(lt.Rewind(clock.Now())) - lt = logicalTime(clock, 14) + // One more latency observation + assert.Len(t, latencyObservations, 2) + // We started at 0.6s and finished at 0.8s + assert.Equal(t, duration("0.2s"), latencyObservations[1].latency) + assert.Equal(t, logiclock.Downscale, latencyObservations[1].flags) // Request to NeonVM completed, it's time to inform the scheduler plugin: a.Call(nextActions).Equals(core.ActionSet{ @@ -427,7 +446,7 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { Permit: resForCU(1), Migrate: nil, }) - state.Plugin().UpdateLogicalTime(lt) + state.Plugin().UpdateLogicalTime(lt.Rewind(clock.Now())) // Finally, check there's no leftover actions: a.Call(nextActions).Equals(core.ActionSet{ From fa758c04197097ecfd14777bca7db9079333571d Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Thu, 11 Jul 2024 14:10:56 +0400 Subject: [PATCH 20/57] get rid of separate UpdateLogicalTime Signed-off-by: Oleg Vasilev --- pkg/agent/core/state.go | 4 ---- pkg/agent/core/state_test.go | 21 ++++++++++++++++----- pkg/agent/core/testhelpers/construct.go | 6 ++++++ 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/pkg/agent/core/state.go b/pkg/agent/core/state.go index 6a2828760..e178a42ed 100644 --- a/pkg/agent/core/state.go +++ b/pkg/agent/core/state.go @@ -1201,10 +1201,6 @@ func (s *State) NeonVM() NeonVMHandle { return NeonVMHandle{&s.internal} } -func (s *State) UpdateLogicalTime(logicalTime *vmv1.LogicalTime) { - s.internal.updateLogicalTime(logicalTime) -} - func (h NeonVMHandle) StartingRequest(now time.Time, resources api.Resources) { // FIXME: add time to ongoing request info (or maybe only in RequestFailed?) 
h.s.NeonVM.OngoingRequested = &resources diff --git a/pkg/agent/core/state_test.go b/pkg/agent/core/state_test.go index 7b514e0cd..e0e9f0337 100644 --- a/pkg/agent/core/state_test.go +++ b/pkg/agent/core/state_test.go @@ -338,7 +338,10 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { assert.Empty(t, latencyObservations) a.Do(state.NeonVM().RequestSuccessful, clock.Now()) - state.UpdateLogicalTime(lt.Rewind(clock.Now())) + a.Do(state.UpdatedVM, helpers.CreateVmInfo( + DefaultInitialStateConfig.VM, + helpers.WithLogicalTime(lt.Rewind(clock.Now())), + )) assert.Len(t, latencyObservations, 1) // We started at 0.2s and finished at 0.4s @@ -420,12 +423,20 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { Wait: &core.ActionWait{Duration: duration("4.4s")}, }) a.Do(state.NeonVM().RequestSuccessful, clock.Now()) - state.UpdateLogicalTime(lt.Rewind(clock.Now())) + + // Update the VM to set current=1, but first wait 0.1s + clockTick().AssertEquals(duration("0.9s")) + a.Do(state.UpdatedVM, helpers.CreateVmInfo( + DefaultInitialStateConfig.VM, + helpers.WithCurrentCU(1), + helpers.WithMinMaxCU(1, 1), + helpers.WithLogicalTime(lt.Rewind(clock.Now())), + )) // One more latency observation assert.Len(t, latencyObservations, 2) - // We started at 0.6s and finished at 0.8s - assert.Equal(t, duration("0.2s"), latencyObservations[1].latency) + // We started at 0.6s and finished at 0.9s + assert.Equal(t, duration("0.3s"), latencyObservations[1].latency) assert.Equal(t, logiclock.Downscale, latencyObservations[1].flags) // Request to NeonVM completed, it's time to inform the scheduler plugin: @@ -439,7 +450,7 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { // shouldn't have anything to say to the other components }) a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(1)) - clockTick().AssertEquals(duration("0.9s")) + clockTick().AssertEquals(duration("1s")) // should have nothing more to do; waiting on plugin request to come back a.Call(nextActions).Equals(core.ActionSet{}) a.NoError(state.Plugin().RequestSuccessful, clock.Now(), api.PluginResponse{ diff --git a/pkg/agent/core/testhelpers/construct.go b/pkg/agent/core/testhelpers/construct.go index 6c1468349..301807f13 100644 --- a/pkg/agent/core/testhelpers/construct.go +++ b/pkg/agent/core/testhelpers/construct.go @@ -179,6 +179,12 @@ func WithCurrentCU(cu uint16) VmInfoOpt { }) } +func WithLogicalTime(t *vmapi.LogicalTime) VmInfoOpt { + return vmInfoModifier(func(c InitialVmInfoConfig, vm *api.VmInfo) { + vm.CurrentLogicalTime = t + }) +} + func WithClock(c core.LogicClock) ClockSourceOpt { return clockInjector(func() core.LogicClock { return c From 5f9c9d10bb907f90127a64e8ff80963a036641ce Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Thu, 11 Jul 2024 14:35:37 +0400 Subject: [PATCH 21/57] rewind clock everywhere Signed-off-by: Oleg Vasilev --- neonvm/apis/neonvm/v1/virtualmachine_types.go | 2 +- pkg/agent/core/state.go | 15 ++++++--------- pkg/agent/core/state_test.go | 14 ++++++++++---- 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/neonvm/apis/neonvm/v1/virtualmachine_types.go b/neonvm/apis/neonvm/v1/virtualmachine_types.go index 911e0f995..ca429277a 100644 --- a/neonvm/apis/neonvm/v1/virtualmachine_types.go +++ b/neonvm/apis/neonvm/v1/virtualmachine_types.go @@ -255,7 +255,7 @@ func (t *LogicalTime) Earliest(other *LogicalTime) *LogicalTime { if other == nil { return t } - if t.UpdatedAt.Before(&other.UpdatedAt) { + if t.Value < other.Value { return t } return other diff --git a/pkg/agent/core/state.go 
b/pkg/agent/core/state.go index e178a42ed..aa528265f 100644 --- a/pkg/agent/core/state.go +++ b/pkg/agent/core/state.go @@ -442,7 +442,7 @@ func (s *state) calculatePluginAction( return nil } }(), - DesiredLogicalTime: desiredLogicalTime, + DesiredLogicalTime: desiredLogicalTime.Rewind(now), }, nil } else { if wantToRequestNewResources && waitingOnRetryBackoff { @@ -463,13 +463,10 @@ func (s *state) calculateNeonVMAction( desiredResources api.Resources, pluginRequested *api.Resources, pluginRequestedPhase string, - logicalTime *vmv1.LogicalTime, + desiredTime *vmv1.LogicalTime, ) (*ActionNeonVMRequest, *time.Duration) { - - desiredTime := logicalTime - if desiredResources.HasFieldLessThan(s.VM.Using()) { - // We are downscaling, so we needed a permit from the¡ monitor + // We are downscaling, so we needed a permit from the monitor desiredTime = desiredTime.Earliest(s.Monitor.CurrentLogicalTime) } @@ -507,7 +504,7 @@ func (s *state) calculateNeonVMAction( return &ActionNeonVMRequest{ Current: s.VM.Using(), Target: desiredResources, - DesiredLogicalTime: desiredTime, + DesiredLogicalTime: desiredTime.Rewind(now), }, nil } else { var reqs []string @@ -593,7 +590,7 @@ func (s *state) calculateMonitorUpscaleAction( return &ActionMonitorUpscale{ Current: *s.Monitor.Approved, Target: requestResources, - DesiredLogicalTime: desiredLogicalTime, + DesiredLogicalTime: desiredLogicalTime.Rewind(now), }, nil } @@ -681,7 +678,7 @@ func (s *state) calculateMonitorDownscaleAction( return &ActionMonitorDownscale{ Current: *s.Monitor.Approved, Target: requestResources, - DesiredLogicalTime: desiredLogicalTime, + DesiredLogicalTime: desiredLogicalTime.Rewind(now), }, nil } diff --git a/pkg/agent/core/state_test.go b/pkg/agent/core/state_test.go index e0e9f0337..0fb09b66b 100644 --- a/pkg/agent/core/state_test.go +++ b/pkg/agent/core/state_test.go @@ -312,7 +312,8 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { Permit: resForCU(2), Migrate: nil, }) - state.Plugin().UpdateLogicalTime(lt.Rewind(clock.Now())) + lt = lt.Rewind(clock.Now()) + state.Plugin().UpdateLogicalTime(lt) // Scheduler approval is done, now we should be making the request to NeonVM a.Call(nextActions).Equals(core.ActionSet{ @@ -337,12 +338,15 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { // Until NeonVM is successful, we won't see any observations. assert.Empty(t, latencyObservations) + // Now NeonVM request is done. 
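(For reference: how a rewound logical timestamp turns into a latency measurement. The following is a standalone sketch, not part of the committed diff; it uses the logiclock API as it exists at this point in the series, before the later rename to revsource.)

package main

import (
	"fmt"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
	"github.com/neondatabase/autoscaling/pkg/agent/core/logiclock"
)

func main() {
	var observed time.Duration
	clock := logiclock.NewClock(func(d time.Duration, _ logiclock.Flag) {
		observed = d
	})

	start := time.Now()
	// A timestamp is issued when the scaling decision is made.
	lt := clock.Next(start, logiclock.Upscale)

	// Some time later the same value comes back from the VM status, with UpdatedAt
	// rewound to the moment it was last handed off.
	_ = clock.Observe(&vmv1.LogicalTime{
		Value:     lt.Value,
		UpdatedAt: metav1.NewTime(start.Add(200 * time.Millisecond)),
	})

	fmt.Println(observed) // 200ms
}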
+ lt = lt.Rewind(clock.Now()) a.Do(state.NeonVM().RequestSuccessful, clock.Now()) a.Do(state.UpdatedVM, helpers.CreateVmInfo( DefaultInitialStateConfig.VM, - helpers.WithLogicalTime(lt.Rewind(clock.Now())), + helpers.WithLogicalTime(lt), )) + // And we see the latency assert.Len(t, latencyObservations, 1) // We started at 0.2s and finished at 0.4s assert.Equal(t, duration("0.2s"), latencyObservations[0].latency) @@ -400,12 +404,13 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { }) a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(1)) clockTick().AssertEquals(duration("0.7s")) + lt = lt.Rewind(clock.Now()) // should have nothing more to do; waiting on vm-monitor request to come back a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: duration("4.5s")}, }) a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now()) - state.Monitor().UpdateLogicalTime(lt.Rewind(clock.Now())) + state.Monitor().UpdateLogicalTime(lt) // After getting approval from the vm-monitor, we make the request to NeonVM to carry it out a.Call(nextActions).Equals(core.ActionSet{ @@ -426,11 +431,12 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { // Update the VM to set current=1, but first wait 0.1s clockTick().AssertEquals(duration("0.9s")) + lt = lt.Rewind(clock.Now()) a.Do(state.UpdatedVM, helpers.CreateVmInfo( DefaultInitialStateConfig.VM, helpers.WithCurrentCU(1), helpers.WithMinMaxCU(1, 1), - helpers.WithLogicalTime(lt.Rewind(clock.Now())), + helpers.WithLogicalTime(lt), )) // One more latency observation From 96890c85ba7ad9d06999a9c60175b8e7764724d3 Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Thu, 11 Jul 2024 14:39:53 +0400 Subject: [PATCH 22/57] fix lint Signed-off-by: Oleg Vasilev --- pkg/agent/core/state_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/agent/core/state_test.go b/pkg/agent/core/state_test.go index 0fb09b66b..c5c844ca9 100644 --- a/pkg/agent/core/state_test.go +++ b/pkg/agent/core/state_test.go @@ -2,11 +2,11 @@ package core_test import ( "fmt" - "github.com/stretchr/testify/assert" "testing" "time" "github.com/samber/lo" + "github.com/stretchr/testify/assert" "go.uber.org/zap" "golang.org/x/exp/slices" From 896d0df840a60135ea8144f32be7fae2f1479420 Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Thu, 11 Jul 2024 15:13:10 +0400 Subject: [PATCH 23/57] tmp: no dedicated .UpdateLogicalTime for Plugin Signed-off-by: Oleg Vasilev --- pkg/agent/core/state.go | 6 ++--- pkg/agent/core/state_test.go | 40 ++++++++++++++-------------- pkg/agent/core/testhelpers/assert.go | 18 +++++++++++++ pkg/agent/executor/exec_plugin.go | 3 +-- 4 files changed, 41 insertions(+), 26 deletions(-) diff --git a/pkg/agent/core/state.go b/pkg/agent/core/state.go index aa528265f..0868a6c22 100644 --- a/pkg/agent/core/state.go +++ b/pkg/agent/core/state.go @@ -1062,6 +1062,7 @@ func (h PluginHandle) RequestFailed(now time.Time) { func (h PluginHandle) RequestSuccessful( now time.Time, + desiredTime *vmv1.LogicalTime, resp api.PluginResponse, ) (_err error) { h.s.Plugin.OngoingRequest = false @@ -1095,13 +1096,10 @@ func (h PluginHandle) RequestSuccessful( // the process of moving the source of truth for ComputeUnit from the scheduler plugin to the // autoscaler-agent. 
h.s.Plugin.Permit = &resp.Permit + h.s.Plugin.CurrentLogicalTime = desiredTime.Rewind(now) return nil } -func (h PluginHandle) UpdateLogicalTime(currentTime *vmv1.LogicalTime) { - h.s.Plugin.CurrentLogicalTime = currentTime -} - // MonitorHandle provides write access to the vm-monitor pieces of an UpdateState type MonitorHandle struct { s *state diff --git a/pkg/agent/core/state_test.go b/pkg/agent/core/state_test.go index c5c844ca9..2b73f678f 100644 --- a/pkg/agent/core/state_test.go +++ b/pkg/agent/core/state_test.go @@ -19,6 +19,8 @@ import ( "github.com/neondatabase/autoscaling/pkg/api" ) +var NilLogicalTime = helpers.SafeVal[vmv1.LogicalTime](nil) + func Test_DesiredResourcesFromMetricsOrRequestedUpscaling(t *testing.T) { slotSize := api.Bytes(1 << 30 /* 1 Gi */) @@ -146,7 +148,7 @@ func Test_DesiredResourcesFromMetricsOrRequestedUpscaling(t *testing.T) { // set lastApproved by simulating a scheduler request/response state.Plugin().StartingRequest(now, c.schedulerApproved) - err := state.Plugin().RequestSuccessful(now, api.PluginResponse{ + err := state.Plugin().RequestSuccessful(now, nil, api.PluginResponse{ Permit: c.schedulerApproved, Migrate: nil, }) @@ -229,7 +231,7 @@ func doInitialPluginRequest(a helpers.Assert, state *core.State, clock *helpers. }) a.Do(state.Plugin().StartingRequest, clock.Now(), resources) clock.Inc(requestTime) - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), api.PluginResponse{ + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ Permit: resources, Migrate: nil, }) @@ -308,12 +310,11 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { clockTick().AssertEquals(duration("0.3s")) // should have nothing more to do; waiting on plugin request to come back a.Call(nextActions).Equals(core.ActionSet{}) - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), api.PluginResponse{ + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), lt, api.PluginResponse{ Permit: resForCU(2), Migrate: nil, }) lt = lt.Rewind(clock.Now()) - state.Plugin().UpdateLogicalTime(lt) // Scheduler approval is done, now we should be making the request to NeonVM a.Call(nextActions).Equals(core.ActionSet{ @@ -459,11 +460,10 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { clockTick().AssertEquals(duration("1s")) // should have nothing more to do; waiting on plugin request to come back a.Call(nextActions).Equals(core.ActionSet{}) - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), api.PluginResponse{ + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), lt, api.PluginResponse{ Permit: resForCU(1), Migrate: nil, }) - state.Plugin().UpdateLogicalTime(lt.Rewind(clock.Now())) // Finally, check there's no leftover actions: a.Call(nextActions).Equals(core.ActionSet{ @@ -523,7 +523,7 @@ func TestPeriodicPluginRequest(t *testing.T) { a.Call(state.NextActions, clock.Now()).Equals(core.ActionSet{}) clock.Inc(reqDuration) a.Call(state.NextActions, clock.Now()).Equals(core.ActionSet{}) - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), api.PluginResponse{ + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ Permit: resources, Migrate: nil, }) @@ -683,7 +683,7 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { Wait: &core.ActionWait{Duration: duration("3.9s")}, }) clockTick() - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), api.PluginResponse{ + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ Permit: resForCU(3), Migrate: 
nil, }) @@ -728,7 +728,7 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { Wait: &core.ActionWait{Duration: duration("1s")}, // still waiting on retrying vm-monitor downscaling }) clockTick() - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), api.PluginResponse{ + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ Permit: resForCU(3), Migrate: nil, }) @@ -798,7 +798,7 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { // not waiting on anything! }) clockTick() - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), api.PluginResponse{ + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ Permit: resForCU(1), Migrate: nil, }) @@ -864,7 +864,7 @@ func TestRequestedUpscale(t *testing.T) { a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: duration("5.9s")}, // same waiting for requested upscale expiring }) - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), api.PluginResponse{ + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ Permit: resForCU(2), Migrate: nil, }) @@ -918,7 +918,7 @@ func TestRequestedUpscale(t *testing.T) { a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: duration("0.9s")}, // waiting for requested upscale expiring }) - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), api.PluginResponse{ + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ Permit: resForCU(2), Migrate: nil, }) @@ -1067,7 +1067,7 @@ func TestDownscalePivotBack(t *testing.T) { halfClockTick() *pluginWait = duration("4.9s") // reset because we just made a request t.Log(" > finish plugin downscale") - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), api.PluginResponse{ + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ Permit: resForCU(1), Migrate: nil, }) @@ -1086,7 +1086,7 @@ func TestDownscalePivotBack(t *testing.T) { clockTick() *pluginWait = duration("4.9s") // reset because we just made a request t.Log(" > finish plugin upscale") - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), api.PluginResponse{ + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ Permit: resForCU(2), Migrate: nil, }) @@ -1228,7 +1228,7 @@ func TestBoundsChangeRequiresDownsale(t *testing.T) { }) a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(1)) clockTick() - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), api.PluginResponse{ + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ Permit: resForCU(1), Migrate: nil, }) @@ -1299,7 +1299,7 @@ func TestBoundsChangeRequiresUpscale(t *testing.T) { }) a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(3)) clockTick() - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), api.PluginResponse{ + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ Permit: resForCU(3), Migrate: nil, }) @@ -1401,7 +1401,7 @@ func TestFailedRequestRetry(t *testing.T) { }) a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(2)) clockTick() - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), api.PluginResponse{ + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ Permit: resForCU(2), Migrate: nil, }) @@ -1489,7 +1489,7 @@ func TestMetricsConcurrentUpdatedDuringDownscale(t *testing.T) { 
}) a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(3)) clockTick() - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), api.PluginResponse{ + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ Permit: resForCU(3), Migrate: nil, }) @@ -1587,7 +1587,7 @@ func TestMetricsConcurrentUpdatedDuringDownscale(t *testing.T) { clockTick() - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), api.PluginResponse{ + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ Permit: resForCU(2), Migrate: nil, }) @@ -1613,7 +1613,7 @@ func TestMetricsConcurrentUpdatedDuringDownscale(t *testing.T) { clockTick() - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), api.PluginResponse{ + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ Permit: resForCU(1), Migrate: nil, }) diff --git a/pkg/agent/core/testhelpers/assert.go b/pkg/agent/core/testhelpers/assert.go index 2fb5bf288..23d460eef 100644 --- a/pkg/agent/core/testhelpers/assert.go +++ b/pkg/agent/core/testhelpers/assert.go @@ -60,6 +60,19 @@ func (a Assert) NoError(f any, args ...any) { a.Call(f, args...).Equals(nil) } +// SafeVal creates a safe value that can be used in Assert.Call() call. +// +// We have to use this function because calling the Assert.Call() method with a +// nil parameter can cause a panic like: +// panic: reflect: Call using zero Value argument... +func SafeVal[T any](i any) (v reflect.Value) { + v = reflect.ValueOf(i) + if i == nil { + v = reflect.Zero(reflect.TypeOf((*T)(nil))) + } + return +} + // Call sets up a prepared function call, which will not be executed until one of its methods is // actually called, which will perform all the relevant checks. // @@ -79,6 +92,11 @@ func (a Assert) Call(f any, args ...any) PreparedFunctionCall { var argValues []reflect.Value for _, a := range args { + if _, ok := a.(reflect.Value); ok { + // This is a SafeVal value, so we can just use it directly + argValues = append(argValues, a.(reflect.Value)) + continue + } argValues = append(argValues, reflect.ValueOf(a)) } diff --git a/pkg/agent/executor/exec_plugin.go b/pkg/agent/executor/exec_plugin.go index 2f4777d3f..d05a88e1b 100644 --- a/pkg/agent/executor/exec_plugin.go +++ b/pkg/agent/executor/exec_plugin.go @@ -61,10 +61,9 @@ func (c *ExecutorCoreWithClients) DoPluginRequests(ctx context.Context, logger * } else { logFields = append(logFields, zap.Any("response", resp)) logger.Info("Plugin request successful", logFields...) - if err := state.Plugin().RequestSuccessful(endTime, *resp); err != nil { + if err := state.Plugin().RequestSuccessful(endTime, action.DesiredLogicalTime, *resp); err != nil { logger.Error("Plugin response validation failed", append(logFields, zap.Error(err))...) 
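(Aside on the SafeVal helper added to testhelpers/assert.go above: reflect.ValueOf(nil) yields the zero reflect.Value, and reflect.Value.Call panics on a zero argument, so a typed nil has to be constructed explicitly. Below is a minimal standalone sketch of the same trick, independent of the test harness; takesPtr is a made-up stand-in, not a function from the patch.)

package main

import (
	"fmt"
	"reflect"
)

// takesPtr stands in for a function under test that accepts a nillable pointer argument.
func takesPtr(p *int) string {
	if p == nil {
		return "got nil"
	}
	return fmt.Sprintf("got %d", *p)
}

func main() {
	fn := reflect.ValueOf(takesPtr)

	// reflect.ValueOf(nil) is the zero reflect.Value, and passing it to Call panics with
	// "reflect: Call using zero Value argument". Building a typed nil avoids that:
	typedNil := reflect.Zero(reflect.TypeOf((*int)(nil)))

	out := fn.Call([]reflect.Value{typedNil})
	fmt.Println(out[0].String()) // "got nil"
}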
} - state.Plugin().UpdateLogicalTime(action.DesiredLogicalTime.Rewind(endTime)) } }) } From ce1dbb31fb1c3cbeb3e6701f77588f6b3cf2a772 Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Thu, 11 Jul 2024 15:19:08 +0400 Subject: [PATCH 24/57] revert unused param Signed-off-by: Oleg Vasilev --- pkg/agent/core/state_test.go | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pkg/agent/core/state_test.go b/pkg/agent/core/state_test.go index 2b73f678f..f019e4c54 100644 --- a/pkg/agent/core/state_test.go +++ b/pkg/agent/core/state_test.go @@ -220,7 +220,7 @@ func logicalTime(clock *helpers.FakeClock, value int64) *vmv1.LogicalTime { } } -func doInitialPluginRequest(a helpers.Assert, state *core.State, clock *helpers.FakeClock, requestTime time.Duration, metrics *api.Metrics, resources api.Resources, _ bool) { +func doInitialPluginRequest(a helpers.Assert, state *core.State, clock *helpers.FakeClock, requestTime time.Duration, metrics *api.Metrics, resources api.Resources) { a.Call(state.NextActions, clock.Now()).Equals(core.ActionSet{ PluginRequest: &core.ActionPluginRequest{ LastPermit: nil, @@ -280,7 +280,7 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { state.Monitor().Active(true) // Send initial scheduler request: - doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(1), true) + doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(1)) // Set metrics clockTick().AssertEquals(duration("0.2s")) @@ -499,7 +499,7 @@ func TestPeriodicPluginRequest(t *testing.T) { reqEvery := DefaultInitialStateConfig.Core.PluginRequestTick endTime := duration("20s") - doInitialPluginRequest(a, state, clock, clockTick, lo.ToPtr(metrics.ToAPI()), resources, false) + doInitialPluginRequest(a, state, clock, clockTick, lo.ToPtr(metrics.ToAPI()), resources) for clock.Elapsed().Duration < endTime { timeSinceScheduledRequest := (clock.Elapsed().Duration - base) % reqEvery @@ -561,7 +561,7 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { state.Monitor().Active(true) - doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(6), false) + doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(6)) // Set metrics clockTick() @@ -832,7 +832,7 @@ func TestRequestedUpscale(t *testing.T) { state.Monitor().Active(true) // Send initial scheduler request: - doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(1), false) + doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(1)) // Set metrics clockTick() @@ -1108,7 +1108,7 @@ func TestDownscalePivotBack(t *testing.T) { state.Monitor().Active(true) - doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(2), false) + doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(2)) clockTick().AssertEquals(duration("0.2s")) pluginWait := duration("4.8s") @@ -1162,7 +1162,7 @@ func TestBoundsChangeRequiresDownsale(t *testing.T) { state.Monitor().Active(true) // Send initial scheduler request: - doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(2), false) + doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(2)) clockTick() @@ -1261,7 +1261,7 @@ func TestBoundsChangeRequiresUpscale(t *testing.T) { state.Monitor().Active(true) // Send initial scheduler request: - doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(2), false) + doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(2)) clockTick() @@ -1360,7 +1360,7 @@ func 
TestFailedRequestRetry(t *testing.T) { state.Monitor().Active(true) // Send initial scheduler request - doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(1), false) + doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(1)) // Set metrics so that we should be trying to upscale clockTick() From 358f6c786eb6ef4c3d7bfedecc80c569419ad6e6 Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Fri, 12 Jul 2024 22:15:48 +0400 Subject: [PATCH 25/57] rename logic clock to revisions Signed-off-by: Oleg Vasilev --- neonvm/apis/neonvm/v1/virtualmachine_types.go | 93 +- .../v1/virtualmachinemigration_types.go | 2 +- .../apis/neonvm/v1/zz_generated.deepcopy.go | 60 +- .../bases/vm.neon.tech_virtualmachines.yaml | 59 +- neonvm/controllers/vm_controller.go | 5 +- pkg/agent/core/action.go | 26 +- pkg/agent/core/dumpstate.go | 16 +- pkg/agent/core/logiclock/logiclock.go | 122 - pkg/agent/core/logiclock/logiclock_test.go | 129 - pkg/agent/core/revsource/revsource.go | 99 + pkg/agent/core/revsource/revsource_test.go | 110 + pkg/agent/core/state.go | 131 +- pkg/agent/core/state_test.go | 2435 ++++++++--------- pkg/agent/core/testhelpers/construct.go | 34 +- pkg/agent/execbridge.go | 4 +- pkg/agent/executor/core.go | 4 +- pkg/agent/executor/exec_monitor.go | 6 +- pkg/agent/executor/exec_neonvm.go | 6 +- pkg/agent/executor/exec_plugin.go | 2 +- pkg/agent/prommetrics.go | 4 +- pkg/agent/runner.go | 13 +- pkg/api/vminfo.go | 16 +- 22 files changed, 1684 insertions(+), 1692 deletions(-) delete mode 100644 pkg/agent/core/logiclock/logiclock.go delete mode 100644 pkg/agent/core/logiclock/logiclock_test.go create mode 100644 pkg/agent/core/revsource/revsource.go create mode 100644 pkg/agent/core/revsource/revsource_test.go diff --git a/neonvm/apis/neonvm/v1/virtualmachine_types.go b/neonvm/apis/neonvm/v1/virtualmachine_types.go index ca429277a..89c5c9fe0 100644 --- a/neonvm/apis/neonvm/v1/virtualmachine_types.go +++ b/neonvm/apis/neonvm/v1/virtualmachine_types.go @@ -142,9 +142,14 @@ type VirtualMachineSpec struct { // +optional EnableSSH *bool `json:"enableSSH,omitempty"` - // Logical timestamp corresponding to the desired resources of the VM. + // TargetRevision is the identifier set by external party to track when changes to the spec + // propagate to the VM. + // + // If a certain value is written into Spec.TargetRevision together with the changes, and + // the same value is observed in Status.CurrentRevision, it means that the changes have + // propagated to the VM. // +optional - DesiredLogicalTime *LogicalTime `json:"desiredLogicalTime,omitempty"` + TargetRevision *RevisionWithTime `json:"desiredLogicalTime,omitempty"` } func (spec *VirtualMachineSpec) Resources() VirtualMachineResources { @@ -221,44 +226,70 @@ func (g Guest) ValidateForMemoryProvider(p MemoryProvider) error { return nil } -// LogicalTime allows to track progress of changes to a VM. -type LogicalTime struct { - Value int64 `json:"value"` - UpdatedAt metav1.Time `json:"updatedAt"` +type Flag uint64 + +func (f *Flag) Set(flag Flag) { + *f |= flag +} + +func (f *Flag) Clear(flag Flag) { + *f &= ^flag +} + +func (f Flag) Has(flag Flag) bool { + return f&flag != 0 +} + +// Revision allows to assign an identifier to a configuration of a VM. +// Later it can be used to track the application of the configuration. 
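(The Flag, Revision, and RevisionWithTime primitives introduced in this hunk replace LogicalTime. Below is a minimal standalone sketch of how they compose; it assumes the revsource flag constants added later in this same commit and is illustrative only, not part of the diff.)

package main

import (
	"fmt"
	"time"

	vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
	"github.com/neondatabase/autoscaling/pkg/agent/core/revsource"
)

func main() {
	// Flags are a plain bitmask.
	var f vmv1.Flag
	f.Set(revsource.Upscale)
	f.Set(revsource.Immediate)
	f.Clear(revsource.Immediate)
	fmt.Println(f.Has(revsource.Upscale), f.Has(revsource.Immediate)) // true false

	// Revisions are ordered by Value; Min picks the older of two acknowledgements.
	a := vmv1.Revision{Value: 3, Flags: f}
	b := vmv1.Revision{Value: 5, Flags: 0}
	fmt.Println(a.Min(b).Value) // 3

	// WithTime stamps a revision with the moment it was handed off, for latency tracking.
	rt := a.WithTime(time.Now())
	fmt.Println(rt.Value, rt.UpdatedAt.Time.IsZero()) // 3 false
}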
+type Revision struct { + Value int64 `json:"value"` + Flags Flag `json:"flags"` +} + +var ZeroRevision = Revision{Value: 0, Flags: 0} + +func (r Revision) Min(other Revision) Revision { + if r.Value < other.Value { + return r + } + return other +} + +func (r Revision) WithTime(t time.Time) RevisionWithTime { + return RevisionWithTime{ + Revision: r, + UpdatedAt: metav1.NewTime(t), + } } // MarshalLogObject implements zapcore.ObjectMarshaler, so that LogicalTime can be used with zap.Object -func (r *LogicalTime) MarshalLogObject(enc zapcore.ObjectEncoder) error { +func (r *Revision) MarshalLogObject(enc zapcore.ObjectEncoder) error { enc.AddInt64("value", r.Value) - enc.AddTime("updatedAt", r.UpdatedAt.Time) + enc.AddUint64("flags", uint64(r.Flags)) return nil } -func (t *LogicalTime) Rewind(now time.Time) *LogicalTime { - if t == nil { - return nil - } - return &LogicalTime{ - Value: t.Value, - UpdatedAt: metav1.NewTime(now), - } +// RevisionWithTime contains a Revision and the time it was last updated. +type RevisionWithTime struct { + Revision `json:"revision"` + UpdatedAt metav1.Time `json:"updatedAt"` } -func (t *LogicalTime) RewindNow() *LogicalTime { - return t.Rewind(time.Now()) +// MarshalLogObject implements zapcore.ObjectMarshaler, so that LogicalTime can be used with zap.Object +func (r *RevisionWithTime) MarshalLogObject(enc zapcore.ObjectEncoder) error { + enc.AddInt64("rev", r.Revision.Value) + enc.AddTime("updatedAt", r.UpdatedAt.Time) + return nil } -func (t *LogicalTime) Earliest(other *LogicalTime) *LogicalTime { - if t == nil { - return other - } - if other == nil { - return t - } - if t.Value < other.Value { - return t - } - return other +func (r *RevisionWithTime) Update(now time.Time, rev Revision) { + r.Revision = rev + r.UpdatedAt = metav1.NewTime(now) +} + +func (t *RevisionWithTime) UpdateNow(rev Revision) { + t.Update(time.Now(), rev) } type GuestSettings struct { @@ -548,7 +579,7 @@ type VirtualMachineStatus struct { // Represents the observations of a VirtualMachine's current state. // VirtualMachine.status.conditions.type are: "Available", "Progressing", and "Degraded" // VirtualMachine.status.conditions.status are one of True, False, Unknown. - // VirtualMachine.status.conditions.reason the value should be a CamelCase string and producers of specific + // VirtualMachine.status.conditions.reason the Value should be a CamelCase string and producers of specific // condition types may define expected values and meanings for this field, and whether the values // are considered a guaranteed API. // VirtualMachine.status.conditions.Message is a human readable message indicating details about the transition. @@ -581,7 +612,7 @@ type VirtualMachineStatus struct { // +optional SSHSecretName string `json:"sshSecretName,omitempty"` // +optional - CurrentLogicalTime *LogicalTime `json:"currentLogicalTime,omitempty"` + CurrentRevision *RevisionWithTime `json:"currentRevision,omitempty"` } type VmPhase string diff --git a/neonvm/apis/neonvm/v1/virtualmachinemigration_types.go b/neonvm/apis/neonvm/v1/virtualmachinemigration_types.go index 3de246363..d909e7fcf 100644 --- a/neonvm/apis/neonvm/v1/virtualmachinemigration_types.go +++ b/neonvm/apis/neonvm/v1/virtualmachinemigration_types.go @@ -73,7 +73,7 @@ type VirtualMachineMigrationStatus struct { // Represents the observations of a VirtualMachineMigration's current state. 
// VirtualMachineMigration.status.conditions.type are: "Available", "Progressing", and "Degraded" // VirtualMachineMigration.status.conditions.status are one of True, False, Unknown. - // VirtualMachineMigration.status.conditions.reason the value should be a CamelCase string and producers of specific + // VirtualMachineMigration.status.conditions.reason the Value should be a CamelCase string and producers of specific // condition types may define expected values and meanings for this field, and whether the values // are considered a guaranteed API. // VirtualMachineMigration.status.conditions.Message is a human readable message indicating details about the transition. diff --git a/neonvm/apis/neonvm/v1/zz_generated.deepcopy.go b/neonvm/apis/neonvm/v1/zz_generated.deepcopy.go index e78d5b5d0..3821c56e3 100644 --- a/neonvm/apis/neonvm/v1/zz_generated.deepcopy.go +++ b/neonvm/apis/neonvm/v1/zz_generated.deepcopy.go @@ -328,22 +328,6 @@ func (in *IPPoolSpec) DeepCopy() *IPPoolSpec { return out } -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *LogicalTime) DeepCopyInto(out *LogicalTime) { - *out = *in - in.UpdatedAt.DeepCopyInto(&out.UpdatedAt) -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new LogicalTime. -func (in *LogicalTime) DeepCopy() *LogicalTime { - if in == nil { - return nil - } - out := new(LogicalTime) - in.DeepCopyInto(out) - return out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *MemorySlots) DeepCopyInto(out *MemorySlots) { *out = *in @@ -421,6 +405,38 @@ func (in *Port) DeepCopy() *Port { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *Revision) DeepCopyInto(out *Revision) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Revision. +func (in *Revision) DeepCopy() *Revision { + if in == nil { + return nil + } + out := new(Revision) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *RevisionWithTime) DeepCopyInto(out *RevisionWithTime) { + *out = *in + out.Revision = in.Revision + in.UpdatedAt.DeepCopyInto(&out.UpdatedAt) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RevisionWithTime. +func (in *RevisionWithTime) DeepCopy() *RevisionWithTime { + if in == nil { + return nil + } + out := new(RevisionWithTime) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *RootDisk) DeepCopyInto(out *RootDisk) { *out = *in @@ -739,9 +755,9 @@ func (in *VirtualMachineSpec) DeepCopyInto(out *VirtualMachineSpec) { *out = new(bool) **out = **in } - if in.DesiredLogicalTime != nil { - in, out := &in.DesiredLogicalTime, &out.DesiredLogicalTime - *out = new(LogicalTime) + if in.TargetRevision != nil { + in, out := &in.TargetRevision, &out.TargetRevision + *out = new(RevisionWithTime) (*in).DeepCopyInto(*out) } } @@ -781,9 +797,9 @@ func (in *VirtualMachineStatus) DeepCopyInto(out *VirtualMachineStatus) { *out = new(MemoryProvider) **out = **in } - if in.CurrentLogicalTime != nil { - in, out := &in.CurrentLogicalTime, &out.CurrentLogicalTime - *out = new(LogicalTime) + if in.CurrentRevision != nil { + in, out := &in.CurrentRevision, &out.CurrentRevision + *out = new(RevisionWithTime) (*in).DeepCopyInto(*out) } } diff --git a/neonvm/config/crd/bases/vm.neon.tech_virtualmachines.yaml b/neonvm/config/crd/bases/vm.neon.tech_virtualmachines.yaml index 96105a1cf..ddade3d2b 100644 --- a/neonvm/config/crd/bases/vm.neon.tech_virtualmachines.yaml +++ b/neonvm/config/crd/bases/vm.neon.tech_virtualmachines.yaml @@ -893,18 +893,33 @@ spec: type: object type: object desiredLogicalTime: - description: Logical timestamp corresponding to the desired resources - of the VM. + description: "TargetRevision is the identifier set by external party + to track when changes to the spec propagate to the VM. \n If a certain + value is written into Spec.TargetRevision together with the changes, + and the same value is observed in Status.CurrentRevision, it means + that the changes have propagated to the VM." properties: + revision: + description: Revision allows to assign an identifier to a configuration + of a VM. Later it can be used to track the application of the + configuration. + properties: + flags: + format: int64 + type: integer + value: + format: int64 + type: integer + required: + - flags + - value + type: object updatedAt: format: date-time type: string - value: - format: int64 - type: integer required: + - revision - updatedAt - - value type: object disks: description: List of disk that can be mounted by virtual machine. @@ -2430,13 +2445,13 @@ spec: process. items: properties: + Value: + default: "" + type: string name: description: Name of the environment variable. Must be a C_IDENTIFIER. type: string - value: - default: "" - type: string required: - name type: object @@ -2800,19 +2815,31 @@ spec: pattern: ^[0-9]+((\.[0-9]*)?|m) type: integer x-kubernetes-int-or-string: true - currentLogicalTime: - description: LogicalTime allows to track progress of changes to a - VM. + currentRevision: + description: RevisionWithTime contains a Revision and the time it + was last updated. properties: + revision: + description: Revision allows to assign an identifier to a configuration + of a VM. Later it can be used to track the application of the + configuration. 
+ properties: + flags: + format: int64 + type: integer + value: + format: int64 + type: integer + required: + - flags + - value + type: object updatedAt: format: date-time type: string - value: - format: int64 - type: integer required: + - revision - updatedAt - - value type: object extraNetIP: type: string diff --git a/neonvm/controllers/vm_controller.go b/neonvm/controllers/vm_controller.go index 5c1a356e0..7a4962714 100644 --- a/neonvm/controllers/vm_controller.go +++ b/neonvm/controllers/vm_controller.go @@ -801,7 +801,10 @@ func (r *VMReconciler) doReconcile(ctx context.Context, vm *vmv1.VirtualMachine) } if vm.Status.Phase == vmv1.VmRunning { - vm.Status.CurrentLogicalTime = vm.Spec.DesiredLogicalTime.RewindNow() + if vm.Spec.TargetRevision != nil { + rev := vm.Spec.TargetRevision.WithTime(time.Now()) + vm.Status.CurrentRevision = &rev + } } return nil diff --git a/pkg/agent/core/action.go b/pkg/agent/core/action.go index 294b94a0f..c7be28f31 100644 --- a/pkg/agent/core/action.go +++ b/pkg/agent/core/action.go @@ -22,28 +22,28 @@ type ActionWait struct { } type ActionPluginRequest struct { - LastPermit *api.Resources `json:"current"` - Target api.Resources `json:"target"` - Metrics *api.Metrics `json:"metrics"` - DesiredLogicalTime *vmv1.LogicalTime `json:"desiredLogicalTime"` + LastPermit *api.Resources `json:"current"` + Target api.Resources `json:"target"` + Metrics *api.Metrics `json:"metrics"` + TargetRevision vmv1.RevisionWithTime `json:"targetRevision"` } type ActionNeonVMRequest struct { - Current api.Resources `json:"current"` - Target api.Resources `json:"target"` - DesiredLogicalTime *vmv1.LogicalTime `json:"desiredLogicalTime"` + Current api.Resources `json:"current"` + Target api.Resources `json:"target"` + TargetRevision vmv1.RevisionWithTime `json:"targetRevision"` } type ActionMonitorDownscale struct { - Current api.Resources `json:"current"` - Target api.Resources `json:"target"` - DesiredLogicalTime *vmv1.LogicalTime `json:"desiredLogicalTime"` + Current api.Resources `json:"current"` + Target api.Resources `json:"target"` + TargetRevision vmv1.RevisionWithTime `json:"targetRevision"` } type ActionMonitorUpscale struct { - Current api.Resources `json:"current"` - Target api.Resources `json:"target"` - DesiredLogicalTime *vmv1.LogicalTime `json:"desiredLogicalTime"` + Current api.Resources `json:"current"` + Target api.Resources `json:"target"` + TargetRevision vmv1.RevisionWithTime `json:"targetRevision"` } func addObjectPtr[T zapcore.ObjectMarshaler](enc zapcore.ObjectEncoder, key string, value *T) error { diff --git a/pkg/agent/core/dumpstate.go b/pkg/agent/core/dumpstate.go index fdd74b2e6..7b3812135 100644 --- a/pkg/agent/core/dumpstate.go +++ b/pkg/agent/core/dumpstate.go @@ -6,7 +6,6 @@ import ( "encoding/json" "time" - vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1" "github.com/neondatabase/autoscaling/pkg/api" ) @@ -41,8 +40,7 @@ func (s *State) Dump() StateDump { Monitor: s.internal.Monitor.deepCopy(), NeonVM: s.internal.NeonVM.deepCopy(), Metrics: shallowCopy[SystemMetrics](s.internal.Metrics), - ClockSource: s.internal.ClockSource, - DesiredLogicalTime: s.internal.DesiredLogicalTime, + TargetRevision: s.internal.TargetRevision, LastDesiredResources: s.internal.LastDesiredResources, }, } @@ -50,11 +48,11 @@ func (s *State) Dump() StateDump { func (s *pluginState) deepCopy() pluginState { return pluginState{ - OngoingRequest: s.OngoingRequest, - LastRequest: shallowCopy[pluginRequested](s.LastRequest), - LastFailureAt: 
shallowCopy[time.Time](s.LastFailureAt), - Permit: shallowCopy[api.Resources](s.Permit), - CurrentLogicalTime: shallowCopy[vmv1.LogicalTime](s.CurrentLogicalTime), + OngoingRequest: s.OngoingRequest, + LastRequest: shallowCopy[pluginRequested](s.LastRequest), + LastFailureAt: shallowCopy[time.Time](s.LastFailureAt), + Permit: shallowCopy[api.Resources](s.Permit), + CurrentRevision: s.CurrentRevision, } } @@ -66,7 +64,7 @@ func (s *monitorState) deepCopy() monitorState { Approved: shallowCopy[api.Resources](s.Approved), DownscaleFailureAt: shallowCopy[time.Time](s.DownscaleFailureAt), UpscaleFailureAt: shallowCopy[time.Time](s.UpscaleFailureAt), - CurrentLogicalTime: shallowCopy[vmv1.LogicalTime](s.CurrentLogicalTime), + CurrentRevision: s.CurrentRevision, } } diff --git a/pkg/agent/core/logiclock/logiclock.go b/pkg/agent/core/logiclock/logiclock.go deleted file mode 100644 index 935691c6f..000000000 --- a/pkg/agent/core/logiclock/logiclock.go +++ /dev/null @@ -1,122 +0,0 @@ -package logiclock - -import ( - "errors" - "time" - - v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - - vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1" -) - -// Flag is a set of flags that can be associated with a logical timestamp. -type Flag uint64 - -const ( - Upscale Flag = 1 << iota - Downscale - Immediate -) - -func (f *Flag) Set(flag Flag) { - *f |= flag -} - -func (f *Flag) Clear(flag Flag) { - *f &= ^flag -} - -func (f Flag) Has(flag Flag) bool { - return f&flag != 0 -} - -// AllFlags and AllFlagNames must have the same order, so the metrics work correctly. -var AllFlags = []Flag{Upscale, Downscale, Immediate} -var AllFlagNames = []string{"upscale", "downscale", "immediate"} - -// FlagsToLabels converts a set of flags to a list of strings which prometheus can take. -func FlagsToLabels(flags Flag) []string { - var ret []string - for _, flag := range AllFlags { - value := "false" - if flags.Has(flag) { - value = "true" - } - ret = append(ret, value) - } - return ret -} - -// Clock can generate and observe logical time. -// Each logical timestamp is associated with a physical timestamp and a set of flags upon creation. -// Once Clock observes a previously generated timestamp after some time, it will call the callback with -// the time difference and the flags associated with the timestamp. -type Clock struct { - cb func(time.Duration, Flag) - - // The in-flight timestamps are stored in-order. - // After the timestamp is observed, it is removed from the measurements, and the offset is increased. 
- measurements []measurement - offset int64 -} - -type measurement struct { - createdAt time.Time - flags Flag -} - -func NewClock(cb func(time.Duration, Flag)) *Clock { - return &Clock{ - cb: cb, - measurements: nil, - offset: 0, - } -} - -func (c *Clock) nextValue() int64 { - return c.offset + int64(len(c.measurements)) -} - -func (c *Clock) Next(now time.Time, flags Flag) *vmv1.LogicalTime { - ret := vmv1.LogicalTime{ - Value: c.nextValue(), - UpdatedAt: v1.NewTime(now), - } - c.measurements = append(c.measurements, measurement{ - createdAt: ret.UpdatedAt.Time, - flags: flags, - }) - return &ret -} - -func (c *Clock) Observe(logicalTime *vmv1.LogicalTime) error { - if logicalTime == nil { - return nil - } - if logicalTime.Value < c.offset { - // Already observed - return nil - } - - idx := logicalTime.Value - c.offset - if idx > int64(len(c.measurements)) { - return errors.New("logicalTime value is in the future") - } - - diff := logicalTime.UpdatedAt.Time.Sub(c.measurements[idx].createdAt) - - if c.cb != nil { - c.cb(diff, c.measurements[idx].flags) - } - - // Forget the measurement, and all the measurements before it. - c.offset = logicalTime.Value + 1 - c.measurements = c.measurements[idx+1:] - - return nil -} - -type NilClock struct{} - -func (c *NilClock) Next(_ time.Time, _ Flag) *vmv1.LogicalTime { return nil } -func (c *NilClock) Observe(_ *vmv1.LogicalTime) error { return nil } diff --git a/pkg/agent/core/logiclock/logiclock_test.go b/pkg/agent/core/logiclock/logiclock_test.go deleted file mode 100644 index d5e82c899..000000000 --- a/pkg/agent/core/logiclock/logiclock_test.go +++ /dev/null @@ -1,129 +0,0 @@ -package logiclock_test - -import ( - "testing" - "time" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - - v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - - vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1" - "github.com/neondatabase/autoscaling/pkg/agent/core/logiclock" -) - -type testClockMetric struct { - *logiclock.Clock - t *testing.T - now v1.Time - result *time.Duration - resultKind *logiclock.Flag -} - -func (tcm *testClockMetric) advance(d time.Duration) { - tcm.now = v1.NewTime(tcm.now.Add(d)) -} - -func (tcm *testClockMetric) assertResult(d time.Duration) { - require.NotNil(tcm.t, tcm.result) - assert.Equal(tcm.t, d, *tcm.result) - tcm.result = nil -} - -func (tcm *testClockMetric) nextNow() *vmv1.LogicalTime { - return tcm.Next(tcm.now.Time, logiclock.Upscale) -} - -func newTestClockMetric(t *testing.T) *testClockMetric { - tcm := &testClockMetric{ - Clock: nil, - t: t, - now: v1.NewTime(time.Now()), - result: nil, - resultKind: nil, - } - - cb := func(d time.Duration, kind logiclock.Flag) { - tcm.result = &d - tcm.resultKind = &kind - } - tcm.Clock = logiclock.NewClock(cb) - - return tcm -} - -func TestClockMetric(t *testing.T) { - tcm := newTestClockMetric(t) - - // Generate new clock - cl := tcm.nextNow() - assert.Equal(t, int64(0), cl.Value) - - // Observe it coming back in 5 seconds - tcm.advance(5 * time.Second) - err := tcm.Observe(&vmv1.LogicalTime{ - Value: 0, - UpdatedAt: tcm.now, - }) - assert.NoError(t, err) - tcm.assertResult(5 * time.Second) -} - -func TestClockMetricSkip(t *testing.T) { - tcm := newTestClockMetric(t) - - // Generate new clock - cl := tcm.nextNow() - assert.Equal(t, int64(0), cl.Value) - - // Generate another one - tcm.advance(5 * time.Second) - cl = tcm.nextNow() - assert.Equal(t, int64(1), cl.Value) - - // Observe the first one - tcm.advance(5 * time.Second) - err := 
tcm.Observe(&vmv1.LogicalTime{ - Value: 0, - UpdatedAt: tcm.now, - }) - assert.NoError(t, err) - tcm.assertResult(10 * time.Second) - - // Observe the second one - tcm.advance(2 * time.Second) - err = tcm.Observe(&vmv1.LogicalTime{ - Value: 1, - UpdatedAt: tcm.now, - }) - assert.NoError(t, err) - tcm.assertResult(7 * time.Second) -} - -func TestClockMetricStale(t *testing.T) { - tcm := newTestClockMetric(t) - - // Generate new clock - cl := tcm.nextNow() - assert.Equal(t, int64(0), cl.Value) - - // Observe it coming back in 5 seconds - tcm.advance(5 * time.Second) - err := tcm.Observe(&vmv1.LogicalTime{ - Value: 0, - UpdatedAt: tcm.now, - }) - assert.NoError(t, err) - tcm.assertResult(5 * time.Second) - - // Observe it coming back again - tcm.advance(5 * time.Second) - err = tcm.Observe(&vmv1.LogicalTime{ - Value: 0, - UpdatedAt: tcm.now, - }) - // No error, but no result either - assert.NoError(t, err) - assert.Nil(t, tcm.result) -} diff --git a/pkg/agent/core/revsource/revsource.go b/pkg/agent/core/revsource/revsource.go new file mode 100644 index 000000000..537826581 --- /dev/null +++ b/pkg/agent/core/revsource/revsource.go @@ -0,0 +1,99 @@ +package revsource + +import ( + "errors" + "time" + + vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1" +) + +const ( + Upscale vmv1.Flag = 1 << iota + Downscale + Immediate +) + +// AllFlags and AllFlagNames must have the same order, so the metrics work correctly. +var AllFlags = []vmv1.Flag{Upscale, Downscale, Immediate} +var AllFlagNames = []string{"upscale", "downscale", "immediate"} + +// FlagsToLabels converts a set of flags to a list of strings which prometheus can take. +func FlagsToLabels(flags vmv1.Flag) []string { + var ret []string + for _, flag := range AllFlags { + value := "false" + if flags.Has(flag) { + value = "true" + } + ret = append(ret, value) + } + return ret +} + +// RevisionSource can generate and observe logical time. +// Each logical timestamp is associated with a physical timestamp and a set of flags upon creation. +// Once RevisionSource observes a previously generated timestamp after some time, it will call the callback with +// the time difference and the flags associated with the timestamp. +type RevisionSource struct { + cb func(time.Duration, vmv1.Flag) + + // The in-flight timestamps are stored in-order. + // After the timestamp is observed, it is removed from the measurements, and the offset is increased. + measurements []time.Time + offset int64 +} + +func NewRevisionSource(cb func(time.Duration, vmv1.Flag)) *RevisionSource { + return &RevisionSource{ + cb: cb, + measurements: nil, + offset: 1, // Start with 1, 0 is reserved for default value. + } +} + +func (c *RevisionSource) nextValue() int64 { + return c.offset + int64(len(c.measurements)) +} + +func (c *RevisionSource) Next(now time.Time) vmv1.Revision { + ret := vmv1.Revision{ + Value: c.nextValue(), + Flags: 0, + } + c.measurements = append(c.measurements, now) + return ret +} + +func (c *RevisionSource) Observe(moment time.Time, rev vmv1.Revision) error { + if rev.Value < c.offset { + // Already observed + return nil + } + + idx := rev.Value - c.offset + if idx > int64(len(c.measurements)) { + return errors.New("revision is in the future") + } + + diff := moment.Sub(c.measurements[idx]) + + if c.cb != nil { + c.cb(diff, rev.Flags) + } + + // Forget the measurement, and all the measurements before it. 
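(The Next/Observe bookkeeping above works as follows: Next hands out the next revision value and remembers when it was issued; Observe looks the revision up by value minus offset, reports the elapsed time through the callback, then drops that measurement and everything older. Below is a standalone sketch of the round trip, importing the revsource package added by this commit; illustrative only, not part of the diff.)

package main

import (
	"fmt"
	"time"

	vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
	"github.com/neondatabase/autoscaling/pkg/agent/core/revsource"
)

func main() {
	var lastLatency time.Duration
	var lastFlags vmv1.Flag

	src := revsource.NewRevisionSource(func(d time.Duration, flags vmv1.Flag) {
		lastLatency = d
		lastFlags = flags
	})

	t0 := time.Now()
	// Hand this revision to the VM spec alongside the configuration change.
	rev := src.Next(t0)

	// Later, the same revision shows up in the VM status; the source reports how long
	// the round trip took and forgets this measurement (and any older ones).
	if err := src.Observe(t0.Add(300*time.Millisecond), rev); err != nil {
		panic(err)
	}

	fmt.Println(lastLatency, lastFlags) // 300ms 0
}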
+ c.offset = rev.Value + 1 + c.measurements = c.measurements[idx+1:] + + return nil +} + +type NilRevisionSource struct{} + +func (c *NilRevisionSource) Next(_ time.Time) vmv1.Revision { + return vmv1.Revision{ + Value: 0, + Flags: 0, + } +} +func (c *NilRevisionSource) Observe(_ time.Time, _ vmv1.Revision) error { return nil } diff --git a/pkg/agent/core/revsource/revsource_test.go b/pkg/agent/core/revsource/revsource_test.go new file mode 100644 index 000000000..75d66f49e --- /dev/null +++ b/pkg/agent/core/revsource/revsource_test.go @@ -0,0 +1,110 @@ +package revsource_test + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1" + "github.com/neondatabase/autoscaling/pkg/agent/core/revsource" +) + +type testRevisionSource struct { + *revsource.RevisionSource + t *testing.T + now v1.Time + result *time.Duration + resultFlags *vmv1.Flag +} + +func (trs *testRevisionSource) advance(d time.Duration) { + trs.now = v1.NewTime(trs.now.Add(d)) +} + +func (trs *testRevisionSource) assertResult(d time.Duration) { + require.NotNil(trs.t, trs.result) + assert.Equal(trs.t, d, *trs.result) + trs.result = nil +} + +func newTestRevisionSource(t *testing.T) *testRevisionSource { + tcm := &testRevisionSource{ + RevisionSource: nil, + t: t, + now: v1.NewTime(time.Now()), + result: nil, + resultFlags: nil, + } + + cb := func(d time.Duration, flags vmv1.Flag) { + tcm.result = &d + tcm.resultFlags = &flags + } + tcm.RevisionSource = revsource.NewRevisionSource(cb) + + return tcm +} + +func TestRevSource(t *testing.T) { + trs := newTestRevisionSource(t) + + // Generate new revision + rev := trs.Next(trs.now.Time) + assert.Equal(t, int64(0), rev.Value) + + // Observe it coming back in 5 seconds + trs.advance(5 * time.Second) + err := trs.Observe(trs.now.Time, rev) + assert.NoError(t, err) + trs.assertResult(5 * time.Second) +} + +func TestRevSourceSkip(t *testing.T) { + trs := newTestRevisionSource(t) + + // Generate new clock + rev1 := trs.Next(trs.now.Time) + assert.Equal(t, int64(0), rev1.Value) + + // Generate another one + trs.advance(5 * time.Second) + rev2 := trs.Next(trs.now.Time) + assert.Equal(t, int64(1), rev2.Value) + + // Observe the first one + trs.advance(5 * time.Second) + err := trs.Observe(trs.now.Time, rev1) + assert.NoError(t, err) + trs.assertResult(10 * time.Second) + + // Observe the second one + trs.advance(2 * time.Second) + err = trs.Observe(trs.now.Time, rev2) + assert.NoError(t, err) + trs.assertResult(7 * time.Second) +} + +func TestStale(t *testing.T) { + trs := newTestRevisionSource(t) + + // Generate new clock + cl := trs.Next(trs.now.Time) + assert.Equal(t, int64(0), cl.Value) + + // Observe it coming back in 5 seconds + trs.advance(5 * time.Second) + err := trs.Observe(trs.now.Time, cl) + assert.NoError(t, err) + trs.assertResult(5 * time.Second) + + // Observe it coming back again + trs.advance(5 * time.Second) + err = trs.Observe(trs.now.Time, cl) + // No error, but no result either + assert.NoError(t, err) + assert.Nil(t, trs.result) +} diff --git a/pkg/agent/core/state.go b/pkg/agent/core/state.go index 0868a6c22..82c654faf 100644 --- a/pkg/agent/core/state.go +++ b/pkg/agent/core/state.go @@ -31,7 +31,7 @@ import ( "go.uber.org/zap" vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1" - "github.com/neondatabase/autoscaling/pkg/agent/core/logiclock" + 
"github.com/neondatabase/autoscaling/pkg/agent/core/revsource" "github.com/neondatabase/autoscaling/pkg/api" "github.com/neondatabase/autoscaling/pkg/util" ) @@ -74,6 +74,9 @@ type Config struct { // Log provides an outlet for (*State).NextActions() to give informative messages or warnings // about conditions that are impeding its ability to execute. Log LogConfig `json:"-"` + + // RevisionSource is the source of logical timestamps for the autoscaler-agent. + RevisionSource RevisionSource `json:"-"` } type LogConfig struct { @@ -117,10 +120,8 @@ type state struct { Metrics *SystemMetrics - ClockSource LogicClock `json:"-"` - - // DesiredLogicalTime is the logical time autoscaler-agent currently works to achieve. - DesiredLogicalTime *vmv1.LogicalTime + // TargetRevision is the revision agent works towards. + TargetRevision vmv1.Revision // LastDesiredResources is the last target agent wanted to scale to. LastDesiredResources *api.Resources @@ -138,7 +139,8 @@ type pluginState struct { // nil if we have not been able to contact *any* scheduler. Permit *api.Resources - CurrentLogicalTime *vmv1.LogicalTime + // CurrentRevision is the most recent revision the plugin has acknowledged. + CurrentRevision vmv1.Revision } type pluginRequested struct { @@ -168,7 +170,8 @@ type monitorState struct { // failed UpscaleFailureAt *time.Time - CurrentLogicalTime *vmv1.LogicalTime + // CurrentRevision is the most recent revision the monitor has acknowledged. + CurrentRevision vmv1.Revision } func (ms *monitorState) active() bool { @@ -210,23 +213,23 @@ func (ns *neonvmState) ongoingRequest() bool { return ns.OngoingRequested != nil } -type LogicClock interface { - Next(ts time.Time, kind logiclock.Flag) *vmv1.LogicalTime - Observe(logicalTime *vmv1.LogicalTime) error +type RevisionSource interface { + Next(ts time.Time) vmv1.Revision + Observe(moment time.Time, rev vmv1.Revision) error } -func NewState(vm api.VmInfo, config Config, clockSource LogicClock) *State { +func NewState(vm api.VmInfo, config Config) *State { state := &State{ internal: state{ Config: config, Debug: false, VM: vm, Plugin: pluginState{ - OngoingRequest: false, - LastRequest: nil, - LastFailureAt: nil, - Permit: nil, - CurrentLogicalTime: nil, + OngoingRequest: false, + LastRequest: nil, + LastFailureAt: nil, + Permit: nil, + CurrentRevision: vmv1.ZeroRevision, }, Monitor: monitorState{ OngoingRequest: nil, @@ -235,7 +238,7 @@ func NewState(vm api.VmInfo, config Config, clockSource LogicClock) *State { Approved: nil, DownscaleFailureAt: nil, UpscaleFailureAt: nil, - CurrentLogicalTime: nil, + CurrentRevision: vmv1.ZeroRevision, }, NeonVM: neonvmState{ LastSuccess: nil, @@ -243,9 +246,8 @@ func NewState(vm api.VmInfo, config Config, clockSource LogicClock) *State { RequestFailedAt: nil, }, Metrics: nil, - ClockSource: clockSource, - DesiredLogicalTime: nil, LastDesiredResources: nil, + TargetRevision: vmv1.ZeroRevision, }, } @@ -286,7 +288,7 @@ func (s *state) nextActions(now time.Time) ActionSet { // ---- // Requests to the scheduler plugin: var pluginRequiredWait *time.Duration - actions.PluginRequest, pluginRequiredWait = s.calculatePluginAction(now, desiredResources, s.DesiredLogicalTime) + actions.PluginRequest, pluginRequiredWait = s.calculatePluginAction(now, desiredResources) // ---- // Requests to NeonVM: @@ -300,7 +302,7 @@ func (s *state) nextActions(now time.Time) ActionSet { pluginRequestedPhase = "planned" } var neonvmRequiredWait *time.Duration - actions.NeonVMRequest, neonvmRequiredWait = s.calculateNeonVMAction(now, 
desiredResources, pluginRequested, pluginRequestedPhase, s.DesiredLogicalTime) + actions.NeonVMRequest, neonvmRequiredWait = s.calculateNeonVMAction(now, desiredResources, pluginRequested, pluginRequestedPhase) // ---- // Requests to vm-monitor (upscaling) @@ -309,7 +311,7 @@ func (s *state) nextActions(now time.Time) ActionSet { // forego notifying the vm-monitor of increased resources because we were busy asking if it // could downscale. var monitorUpscaleRequiredWait *time.Duration - actions.MonitorUpscale, monitorUpscaleRequiredWait = s.calculateMonitorUpscaleAction(now, desiredResources, s.DesiredLogicalTime) + actions.MonitorUpscale, monitorUpscaleRequiredWait = s.calculateMonitorUpscaleAction(now, desiredResources) // ---- // Requests to vm-monitor (downscaling) @@ -320,7 +322,6 @@ func (s *state) nextActions(now time.Time) ActionSet { now, desiredResources, plannedUpscale, - s.DesiredLogicalTime, ) // --- and that's all the request types! --- @@ -354,7 +355,6 @@ func (s *state) nextActions(now time.Time) ActionSet { func (s *state) calculatePluginAction( now time.Time, desiredResources api.Resources, - desiredLogicalTime *vmv1.LogicalTime, ) (*ActionPluginRequest, *time.Duration) { logFailureReason := func(reason string) { s.warnf("Wanted to make a request to the scheduler plugin, but %s", reason) @@ -442,7 +442,7 @@ func (s *state) calculatePluginAction( return nil } }(), - DesiredLogicalTime: desiredLogicalTime.Rewind(now), + TargetRevision: s.TargetRevision.WithTime(now), }, nil } else { if wantToRequestNewResources && waitingOnRetryBackoff { @@ -463,16 +463,16 @@ func (s *state) calculateNeonVMAction( desiredResources api.Resources, pluginRequested *api.Resources, pluginRequestedPhase string, - desiredTime *vmv1.LogicalTime, ) (*ActionNeonVMRequest, *time.Duration) { + targetRevision := s.TargetRevision if desiredResources.HasFieldLessThan(s.VM.Using()) { // We are downscaling, so we needed a permit from the monitor - desiredTime = desiredTime.Earliest(s.Monitor.CurrentLogicalTime) + targetRevision = targetRevision.Min(s.Monitor.CurrentRevision) } if desiredResources.HasFieldGreaterThan(s.VM.Using()) { // We are upscaling, so we needed a permit from the plugin - desiredTime = desiredTime.Earliest(s.Plugin.CurrentLogicalTime) + targetRevision = targetRevision.Min(s.Plugin.CurrentRevision) } // clamp desiredResources to what we're allowed to make a request for @@ -502,9 +502,9 @@ func (s *state) calculateNeonVMAction( } return &ActionNeonVMRequest{ - Current: s.VM.Using(), - Target: desiredResources, - DesiredLogicalTime: desiredTime.Rewind(now), + Current: s.VM.Using(), + Target: desiredResources, + TargetRevision: targetRevision.WithTime(now), }, nil } else { var reqs []string @@ -526,7 +526,6 @@ func (s *state) calculateNeonVMAction( func (s *state) calculateMonitorUpscaleAction( now time.Time, desiredResources api.Resources, - desiredLogicalTime *vmv1.LogicalTime, ) (*ActionMonitorUpscale, *time.Duration) { // can't do anything if we don't have an active connection to the vm-monitor if !s.Monitor.active() { @@ -588,9 +587,9 @@ func (s *state) calculateMonitorUpscaleAction( // Otherwise, we can make the request: return &ActionMonitorUpscale{ - Current: *s.Monitor.Approved, - Target: requestResources, - DesiredLogicalTime: desiredLogicalTime.Rewind(now), + Current: *s.Monitor.Approved, + Target: requestResources, + TargetRevision: s.TargetRevision.WithTime(now), }, nil } @@ -598,7 +597,6 @@ func (s *state) calculateMonitorDownscaleAction( now time.Time, desiredResources 
api.Resources, plannedUpscaleRequest bool, - desiredLogicalTime *vmv1.LogicalTime, ) (*ActionMonitorDownscale, *time.Duration) { // can't do anything if we don't have an active connection to the vm-monitor if !s.Monitor.active() { @@ -676,9 +674,9 @@ func (s *state) calculateMonitorDownscaleAction( // Nothing else to check, we're good to make the request return &ActionMonitorDownscale{ - Current: *s.Monitor.Approved, - Target: requestResources, - DesiredLogicalTime: desiredLogicalTime.Rewind(now), + Current: *s.Monitor.Approved, + Target: requestResources, + TargetRevision: s.TargetRevision.WithTime(now), }, nil } @@ -841,38 +839,27 @@ func (s *state) desiredResourcesFromMetricsOrRequestedUpscaling(now time.Time) ( return nil } } - s.updateDesiredLogicalTime(now, result, s.VM.Using(), requestedUpscalingAffectedResult) + s.updateTargetRevision(now, result, s.VM.Using(), requestedUpscalingAffectedResult) s.LastDesiredResources = &result s.info("Calculated desired resources", zap.Object("current", s.VM.Using()), zap.Object("target", result), - zap.Object("desiredLogicalTime", s.DesiredLogicalTime)) + zap.Object("targetRevision", &s.TargetRevision)) return result, calculateWaitTime } -func (s *state) updateDesiredLogicalTime( +func (s *state) updateTargetRevision( now time.Time, desired api.Resources, current api.Resources, immediate bool, ) { - var flags logiclock.Flag - if desired.HasFieldGreaterThan(current) { - flags.Set(logiclock.Upscale) - } - if desired.HasFieldLessThan(current) { - flags.Set(logiclock.Downscale) - } - if immediate { - flags.Set(logiclock.Immediate) - } - if s.LastDesiredResources == nil { if desired == current { - // First iteration, but no scaling required + // First iteration, and no scaling required return } } else { @@ -882,11 +869,21 @@ func (s *state) updateDesiredLogicalTime( } } - s.DesiredLogicalTime = s.ClockSource.Next(now, flags) + s.TargetRevision = s.Config.RevisionSource.Next(now) + + if desired.HasFieldGreaterThan(current) { + s.TargetRevision.Flags.Set(revsource.Upscale) + } + if desired.HasFieldLessThan(current) { + s.TargetRevision.Flags.Set(revsource.Downscale) + } + if immediate { + s.TargetRevision.Flags.Set(revsource.Immediate) + } } -func (s *state) updateLogicalTime(logicalTime *vmv1.LogicalTime) { - err := s.ClockSource.Observe(logicalTime) +func (s *state) updateCurrentRevision(rev vmv1.RevisionWithTime) { + err := s.Config.RevisionSource.Observe(rev.UpdatedAt.Time, rev.Revision) if err != nil { s.warnf("Failed to observe clock source: %v", err) } @@ -1027,7 +1024,9 @@ func (s *State) UpdatedVM(vm api.VmInfo) { // - https://github.com/neondatabase/autoscaling/issues/462 vm.SetUsing(s.internal.VM.Using()) s.internal.VM = vm - s.internal.updateLogicalTime(vm.CurrentLogicalTime) + if vm.CurrentRevision != nil { + s.internal.updateCurrentRevision(*vm.CurrentRevision) + } } func (s *State) UpdateSystemMetrics(metrics SystemMetrics) { @@ -1062,7 +1061,7 @@ func (h PluginHandle) RequestFailed(now time.Time) { func (h PluginHandle) RequestSuccessful( now time.Time, - desiredTime *vmv1.LogicalTime, + rev vmv1.RevisionWithTime, resp api.PluginResponse, ) (_err error) { h.s.Plugin.OngoingRequest = false @@ -1096,7 +1095,7 @@ func (h PluginHandle) RequestSuccessful( // the process of moving the source of truth for ComputeUnit from the scheduler plugin to the // autoscaler-agent. 
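Pulling the pieces in this hunk together: `updateTargetRevision` only mints a new revision (via `Config.RevisionSource.Next`) when the desired resources actually change, stamps it with `Upscale`/`Downscale`/`Immediate` flags, and the revision then rides along on the plugin, NeonVM and vm-monitor actions as `TargetRevision.WithTime(now)`; once the VM object echoes it back through its status, `UpdatedVM` → `updateCurrentRevision` → `Observe` turns the round trip into a latency sample. The `Min` against `Plugin.CurrentRevision` / `Monitor.CurrentRevision` in `calculateNeonVMAction` (its definition is added to `vmv1.Revision` elsewhere in this patch series, presumably keeping the earlier of the two revisions like the `Earliest` helper it replaces) stops a NeonVM request from carrying a revision that has not been acknowledged yet. A condensed replay of the upscale path that `TestBasicScaleUpAndDownFlow` below walks through, using that test's fake-clock timings (0.2s to 0.4s):

```go
package main

import (
	"fmt"
	"time"

	vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
	"github.com/neondatabase/autoscaling/pkg/agent/core/revsource"
)

func main() {
	base := time.Now()
	at := func(d time.Duration) time.Time { return base.Add(d) }

	src := revsource.NewRevisionSource(func(latency time.Duration, flags vmv1.Flag) {
		// With the timings below this reports a 200ms latency for an Upscale revision.
		fmt.Printf("scaling latency %v, upscale=%v\n", latency, flags.Has(revsource.Upscale))
	})

	// t=0.2s: metrics say "scale up"; updateTargetRevision mints a revision
	// and flags it, exactly as the code above does.
	target := src.Next(at(200 * time.Millisecond))
	target.Flags.Set(revsource.Upscale)

	// t=0.3s-0.4s: the revision is carried on the plugin and NeonVM requests
	// as target.WithTime(now) (not shown here).

	// t=0.4s: the VM status reports the revision back; UpdatedVM hands it to
	// updateCurrentRevision, which calls Observe with the status timestamp.
	_ = src.Observe(at(400*time.Millisecond), target)
}
```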
h.s.Plugin.Permit = &resp.Permit - h.s.Plugin.CurrentLogicalTime = desiredTime.Rewind(now) + h.s.Plugin.CurrentRevision = rev.Revision return nil } @@ -1117,7 +1116,7 @@ func (h MonitorHandle) Reset() { Approved: nil, DownscaleFailureAt: nil, UpscaleFailureAt: nil, - CurrentLogicalTime: nil, + CurrentRevision: vmv1.ZeroRevision, } } @@ -1164,23 +1163,21 @@ func (h MonitorHandle) StartingDownscaleRequest(now time.Time, resources api.Res h.s.Monitor.DownscaleFailureAt = nil } -func (h MonitorHandle) DownscaleRequestAllowed(now time.Time) { +func (h MonitorHandle) DownscaleRequestAllowed(now time.Time, rev vmv1.RevisionWithTime) { h.s.Monitor.Approved = &h.s.Monitor.OngoingRequest.Requested h.s.Monitor.OngoingRequest = nil -} - -func (h MonitorHandle) UpdateLogicalTime(currentTime *vmv1.LogicalTime) { - h.s.Monitor.CurrentLogicalTime = currentTime + h.s.Monitor.CurrentRevision = rev.Revision } // Downscale request was successful but the monitor denied our request. -func (h MonitorHandle) DownscaleRequestDenied(now time.Time) { +func (h MonitorHandle) DownscaleRequestDenied(now time.Time, rev vmv1.RevisionWithTime) { h.s.Monitor.DeniedDownscale = &deniedDownscale{ At: now, Current: *h.s.Monitor.Approved, Requested: h.s.Monitor.OngoingRequest.Requested, } h.s.Monitor.OngoingRequest = nil + h.s.Monitor.CurrentRevision = rev.Revision } func (h MonitorHandle) DownscaleRequestFailed(now time.Time) { diff --git a/pkg/agent/core/state_test.go b/pkg/agent/core/state_test.go index f019e4c54..56bdf908a 100644 --- a/pkg/agent/core/state_test.go +++ b/pkg/agent/core/state_test.go @@ -2,6 +2,7 @@ package core_test import ( "fmt" + "github.com/stretchr/testify/require" "testing" "time" @@ -10,17 +11,13 @@ import ( "go.uber.org/zap" "golang.org/x/exp/slices" - v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1" "github.com/neondatabase/autoscaling/pkg/agent/core" - "github.com/neondatabase/autoscaling/pkg/agent/core/logiclock" + "github.com/neondatabase/autoscaling/pkg/agent/core/revsource" helpers "github.com/neondatabase/autoscaling/pkg/agent/core/testhelpers" "github.com/neondatabase/autoscaling/pkg/api" ) -var NilLogicalTime = helpers.SafeVal[vmv1.LogicalTime](nil) - func Test_DesiredResourcesFromMetricsOrRequestedUpscaling(t *testing.T) { slotSize := api.Bytes(1 << 30 /* 1 Gi */) @@ -88,9 +85,6 @@ func Test_DesiredResourcesFromMetricsOrRequestedUpscaling(t *testing.T) { for _, c := range cases { warnings := []string{} - - source := logiclock.NewClock(nil) - state := core.NewState( api.VmInfo{ Name: "test", @@ -113,7 +107,7 @@ func Test_DesiredResourcesFromMetricsOrRequestedUpscaling(t *testing.T) { ScalingEnabled: true, ScalingConfig: nil, }, - CurrentLogicalTime: nil, + CurrentRevision: nil, }, core.Config{ ComputeUnit: api.Resources{VCPU: 250, Mem: 1 * slotSize}, @@ -136,8 +130,8 @@ func Test_DesiredResourcesFromMetricsOrRequestedUpscaling(t *testing.T) { warnings = append(warnings, msg) }, }, + RevisionSource: revsource.NewRevisionSource(nil), }, - source, ) t.Run(c.name, func(t *testing.T) { @@ -148,7 +142,7 @@ func Test_DesiredResourcesFromMetricsOrRequestedUpscaling(t *testing.T) { // set lastApproved by simulating a scheduler request/response state.Plugin().StartingRequest(now, c.schedulerApproved) - err := state.Plugin().RequestSuccessful(now, nil, api.PluginResponse{ + err := state.Plugin().RequestSuccessful(now, vmv1.ZeroRevision.WithTime(now), api.PluginResponse{ Permit: c.schedulerApproved, Migrate: nil, }) @@ -162,7 +156,7 @@ func 
Test_DesiredResourcesFromMetricsOrRequestedUpscaling(t *testing.T) { state.Monitor().Reset() state.Monitor().Active(true) state.Monitor().StartingDownscaleRequest(now, *c.deniedDownscale) - state.Monitor().DownscaleRequestDenied(now) + state.Monitor().DownscaleRequestDenied(now, vmv1.ZeroRevision.WithTime(now)) } actual, _ := state.DesiredResourcesFromMetricsOrRequestedUpscaling(now) @@ -205,6 +199,7 @@ var DefaultInitialStateConfig = helpers.InitialStateConfig{ Info: nil, Warn: nil, }, + RevisionSource: &revsource.NilRevisionSource{}, }, } @@ -213,25 +208,19 @@ func getDesiredResources(state *core.State, now time.Time) api.Resources { return res } -func logicalTime(clock *helpers.FakeClock, value int64) *vmv1.LogicalTime { - return &vmv1.LogicalTime{ - Value: value, - UpdatedAt: v1.NewTime(clock.Now()), - } -} - func doInitialPluginRequest(a helpers.Assert, state *core.State, clock *helpers.FakeClock, requestTime time.Duration, metrics *api.Metrics, resources api.Resources) { + rev := vmv1.ZeroRevision a.Call(state.NextActions, clock.Now()).Equals(core.ActionSet{ PluginRequest: &core.ActionPluginRequest{ - LastPermit: nil, - Target: resources, - Metrics: metrics, - DesiredLogicalTime: nil, + LastPermit: nil, + Target: resources, + Metrics: metrics, + TargetRevision: rev.WithTime(clock.Now()), }, }) a.Do(state.Plugin().StartingRequest, clock.Now(), resources) clock.Inc(requestTime) - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), rev.WithTime(clock.Now()), api.PluginResponse{ Permit: resources, Migrate: nil, }) @@ -257,21 +246,23 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { var latencyObservations []struct { latency time.Duration - flags logiclock.Flag + flags vmv1.Flag } - logicClock := logiclock.NewClock(func(latency time.Duration, flags logiclock.Flag) { + expectedRevision := vmv1.ZeroRevision + cfg := DefaultInitialStateConfig + + cfg.Core.RevisionSource = revsource.NewRevisionSource(func(latency time.Duration, flags vmv1.Flag) { latencyObservations = append(latencyObservations, struct { latency time.Duration - flags logiclock.Flag + flags vmv1.Flag }{latency, flags}) }) state := helpers.CreateInitialState( - DefaultInitialStateConfig, + cfg, helpers.WithStoredWarnings(a.StoredWarnings()), helpers.WithTestingLogfWarnings(t), - helpers.WithClock(logicClock), ) nextActions := func() core.ActionSet { return state.NextActions(clock.Now()) @@ -294,15 +285,17 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { Equals(resForCU(2)) // Now that the initial scheduler request is done, and we have metrics that indicate - // scale-up would be a good idea. Logical time nil -> 0. - lt := logicalTime(clock, 0) + // scale-up would be a good idea. Revision advances. + expectedRevision.Value = 1 + expectedRevision.Flags = revsource.Upscale + // We should be contacting the scheduler to get approval. 
a.Call(nextActions).Equals(core.ActionSet{ PluginRequest: &core.ActionPluginRequest{ - LastPermit: lo.ToPtr(resForCU(1)), - Target: resForCU(2), - Metrics: lo.ToPtr(lastMetrics.ToAPI()), - DesiredLogicalTime: lt, + LastPermit: lo.ToPtr(resForCU(1)), + Target: resForCU(2), + Metrics: lo.ToPtr(lastMetrics.ToAPI()), + TargetRevision: expectedRevision.WithTime(clock.Now()), }, }) // start the request: @@ -310,11 +303,10 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { clockTick().AssertEquals(duration("0.3s")) // should have nothing more to do; waiting on plugin request to come back a.Call(nextActions).Equals(core.ActionSet{}) - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), lt, api.PluginResponse{ + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), expectedRevision.WithTime(clock.Now()), api.PluginResponse{ Permit: resForCU(2), Migrate: nil, }) - lt = lt.Rewind(clock.Now()) // Scheduler approval is done, now we should be making the request to NeonVM a.Call(nextActions).Equals(core.ActionSet{ @@ -323,9 +315,9 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { // the next scheduler request. Wait: &core.ActionWait{Duration: duration("4.9s")}, NeonVMRequest: &core.ActionNeonVMRequest{ - Current: resForCU(1), - Target: resForCU(2), - DesiredLogicalTime: lt, + Current: resForCU(1), + Target: resForCU(2), + TargetRevision: expectedRevision.WithTime(clock.Now()), }, }) // start the request: @@ -340,26 +332,27 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { assert.Empty(t, latencyObservations) // Now NeonVM request is done. - lt = lt.Rewind(clock.Now()) a.Do(state.NeonVM().RequestSuccessful, clock.Now()) - a.Do(state.UpdatedVM, helpers.CreateVmInfo( + vmInfo := helpers.CreateVmInfo( DefaultInitialStateConfig.VM, - helpers.WithLogicalTime(lt), - )) + ) + rev := expectedRevision.WithTime(clock.Now()) + vmInfo.CurrentRevision = &rev + a.Do(state.UpdatedVM, vmInfo) // And we see the latency - assert.Len(t, latencyObservations, 1) + require.Len(t, latencyObservations, 1) // We started at 0.2s and finished at 0.4s assert.Equal(t, duration("0.2s"), latencyObservations[0].latency) - assert.Equal(t, logiclock.Upscale, latencyObservations[0].flags) + assert.Equal(t, revsource.Upscale, latencyObservations[0].flags) // NeonVM change is done, now we should finish by notifying the vm-monitor a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: duration("4.8s")}, // same as previous, clock hasn't changed MonitorUpscale: &core.ActionMonitorUpscale{ - Current: resForCU(1), - Target: resForCU(2), - DesiredLogicalTime: lt, + Current: resForCU(1), + Target: resForCU(2), + TargetRevision: expectedRevision.WithTime(clock.Now()), }, }) // start the request: @@ -370,7 +363,6 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { Wait: &core.ActionWait{Duration: duration("4.7s")}, }) a.Do(state.Monitor().UpscaleRequestSuccessful, clock.Now()) - state.Monitor().UpdateLogicalTime(lt.Rewind(clock.Now())) // And now, double-check that there's no sneaky follow-up actions before we change the // metrics @@ -382,6 +374,9 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { clockTick().AssertEquals(duration("0.6s")) + expectedRevision.Value += 1 + expectedRevision.Flags = revsource.Downscale + // Set metrics back so that desired resources should now be zero lastMetrics = core.SystemMetrics{ LoadAverage1Min: 0.0, @@ -392,34 +387,30 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { a.Call(getDesiredResources, state, clock.Now()). 
Equals(resForCU(1)) - lt = logicalTime(clock, 1) - // First step in downscaling is getting approval from the vm-monitor: a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: duration("4.6s")}, MonitorDownscale: &core.ActionMonitorDownscale{ - Current: resForCU(2), - Target: resForCU(1), - DesiredLogicalTime: lt, + Current: resForCU(2), + Target: resForCU(1), + TargetRevision: expectedRevision.WithTime(clock.Now()), }, }) a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(1)) clockTick().AssertEquals(duration("0.7s")) - lt = lt.Rewind(clock.Now()) // should have nothing more to do; waiting on vm-monitor request to come back a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: duration("4.5s")}, }) - a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now()) - state.Monitor().UpdateLogicalTime(lt) + a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now(), expectedRevision.WithTime(clock.Now())) // After getting approval from the vm-monitor, we make the request to NeonVM to carry it out a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: duration("4.5s")}, // same as previous, clock hasn't changed NeonVMRequest: &core.ActionNeonVMRequest{ - Current: resForCU(2), - Target: resForCU(1), - DesiredLogicalTime: lt, + Current: resForCU(2), + Target: resForCU(1), + TargetRevision: expectedRevision.WithTime(clock.Now()), }, }) a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(1)) @@ -432,27 +423,28 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { // Update the VM to set current=1, but first wait 0.1s clockTick().AssertEquals(duration("0.9s")) - lt = lt.Rewind(clock.Now()) - a.Do(state.UpdatedVM, helpers.CreateVmInfo( + vmInfo = helpers.CreateVmInfo( DefaultInitialStateConfig.VM, helpers.WithCurrentCU(1), helpers.WithMinMaxCU(1, 1), - helpers.WithLogicalTime(lt), - )) + ) + rev = expectedRevision.WithTime(clock.Now()) + vmInfo.CurrentRevision = &rev + a.Do(state.UpdatedVM, vmInfo) // One more latency observation - assert.Len(t, latencyObservations, 2) + require.Len(t, latencyObservations, 2) // We started at 0.6s and finished at 0.9s assert.Equal(t, duration("0.3s"), latencyObservations[1].latency) - assert.Equal(t, logiclock.Downscale, latencyObservations[1].flags) + assert.Equal(t, revsource.Downscale, latencyObservations[1].flags) // Request to NeonVM completed, it's time to inform the scheduler plugin: a.Call(nextActions).Equals(core.ActionSet{ PluginRequest: &core.ActionPluginRequest{ - LastPermit: lo.ToPtr(resForCU(2)), - Target: resForCU(1), - Metrics: lo.ToPtr(lastMetrics.ToAPI()), - DesiredLogicalTime: lt, + LastPermit: lo.ToPtr(resForCU(2)), + Target: resForCU(1), + Metrics: lo.ToPtr(lastMetrics.ToAPI()), + TargetRevision: expectedRevision.WithTime(clock.Now()), }, // shouldn't have anything to say to the other components }) @@ -460,7 +452,7 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { clockTick().AssertEquals(duration("1s")) // should have nothing more to do; waiting on plugin request to come back a.Call(nextActions).Equals(core.ActionSet{}) - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), lt, api.PluginResponse{ + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), expectedRevision.WithTime(clock.Now()), api.PluginResponse{ Permit: resForCU(1), Migrate: nil, }) @@ -471,1154 +463,1155 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { }) } -// Test that in a stable state, requests to the plugin happen exactly every Config.PluginRequestTick -func 
TestPeriodicPluginRequest(t *testing.T) { - a := helpers.NewAssert(t) - clock := helpers.NewFakeClock(t) - - state := helpers.CreateInitialState( - DefaultInitialStateConfig, - helpers.WithStoredWarnings(a.StoredWarnings()), - ) - - state.Monitor().Active(true) - - metrics := core.SystemMetrics{ - LoadAverage1Min: 0.0, - MemoryUsageBytes: 0.0, - } - resources := DefaultComputeUnit - - a.Do(state.UpdateSystemMetrics, metrics) - - base := duration("0s") - clock.Elapsed().AssertEquals(base) - - clockTick := duration("100ms") - reqDuration := duration("50ms") - reqEvery := DefaultInitialStateConfig.Core.PluginRequestTick - endTime := duration("20s") - - doInitialPluginRequest(a, state, clock, clockTick, lo.ToPtr(metrics.ToAPI()), resources) - - for clock.Elapsed().Duration < endTime { - timeSinceScheduledRequest := (clock.Elapsed().Duration - base) % reqEvery - - if timeSinceScheduledRequest != 0 { - timeUntilNextRequest := reqEvery - timeSinceScheduledRequest - a.Call(state.NextActions, clock.Now()).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: timeUntilNextRequest}, - }) - clock.Inc(clockTick) - } else { - a.Call(state.NextActions, clock.Now()).Equals(core.ActionSet{ - PluginRequest: &core.ActionPluginRequest{ - LastPermit: &resources, - Target: resources, - Metrics: lo.ToPtr(metrics.ToAPI()), - DesiredLogicalTime: nil, - }, - }) - a.Do(state.Plugin().StartingRequest, clock.Now(), resources) - a.Call(state.NextActions, clock.Now()).Equals(core.ActionSet{}) - clock.Inc(reqDuration) - a.Call(state.NextActions, clock.Now()).Equals(core.ActionSet{}) - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ - Permit: resources, - Migrate: nil, - }) - clock.Inc(clockTick - reqDuration) - } - } -} - -// Checks that when downscaling is denied, we both (a) try again with higher resources, or (b) wait -// to retry if there aren't higher resources to try with. -func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { - a := helpers.NewAssert(t) - clock := helpers.NewFakeClock(t) - clockTickDuration := duration("0.1s") - clockTick := func() { - clock.Inc(clockTickDuration) - } - resForCU := DefaultComputeUnit.Mul - - state := helpers.CreateInitialState( - DefaultInitialStateConfig, - helpers.WithStoredWarnings(a.StoredWarnings()), - helpers.WithMinMaxCU(1, 8), - helpers.WithCurrentCU(6), // NOTE: Start at 6 CU, so we're trying to scale down immediately. - helpers.WithConfigSetting(func(c *core.Config) { - // values close to the default, so request timing works out a little better. - c.PluginRequestTick = duration("7s") - c.MonitorDeniedDownscaleCooldown = duration("4s") - }), - ) - - nextActions := func() core.ActionSet { - return state.NextActions(clock.Now()) - } - - state.Monitor().Active(true) - - doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(6)) - - // Set metrics - clockTick() - metrics := core.SystemMetrics{ - LoadAverage1Min: 0.0, - MemoryUsageBytes: 0.0, - } - a.Do(state.UpdateSystemMetrics, metrics) - // double-check that we agree about the desired resources - a.Call(getDesiredResources, state, clock.Now()). - Equals(resForCU(1)) - - // Broadly the idea here is that we should be trying to request downscaling from the vm-monitor, - // and retrying with progressively higher values until either we get approved, or we run out of - // options, at which point we should wait until later to re-request downscaling. - // - // This behavior results in linear retry passes. - // - // For this test, we: - // 1. 
Deny any request in the first pass - // 2. Approve only down to 3 CU on the second pass - // a. triggers NeonVM request - // b. triggers plugin request - // 3. Deny all requests in the third pass (i.e. stay at 3 CU) - // 4. Approve down to 1 CU on the fourth pass - // a. triggers NeonVM request - // b. triggers plugin request - // - // ---- - // - // First pass: deny downscaling. - clock.Elapsed() - - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("6.8s")}, - MonitorDownscale: &core.ActionMonitorDownscale{ - Current: resForCU(6), - Target: resForCU(5), - DesiredLogicalTime: nil, - }, - }) - a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(5)) - clockTick() - a.Do(state.Monitor().DownscaleRequestDenied, clock.Now()) - - // At the end, we should be waiting to retry downscaling: - a.Call(nextActions).Equals(core.ActionSet{ - // Taken from DefaultInitialStateConfig.Core.MonitorDeniedDownscaleCooldown - Wait: &core.ActionWait{Duration: duration("4.0s")}, - }) - - clock.Inc(duration("4s")) - currentPluginWait := duration("2.7s") - - // Second pass: Approve only down to 3 CU, then NeonVM & plugin requests. - for cu := uint16(5); cu >= 2; cu -= 1 { - var expectedNeonVMRequest *core.ActionNeonVMRequest - if cu < 5 { - expectedNeonVMRequest = &core.ActionNeonVMRequest{ - Current: resForCU(6), - Target: resForCU(cu + 1), - DesiredLogicalTime: nil, - } - } - - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: currentPluginWait}, - MonitorDownscale: &core.ActionMonitorDownscale{ - Current: resForCU(cu + 1), - Target: resForCU(cu), - DesiredLogicalTime: nil, - }, - NeonVMRequest: expectedNeonVMRequest, - }) - a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(cu)) - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: currentPluginWait}, - NeonVMRequest: expectedNeonVMRequest, - }) - clockTick() - currentPluginWait -= clockTickDuration - if cu >= 3 /* allow down to 3 */ { - a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now()) - } else { - a.Do(state.Monitor().DownscaleRequestDenied, clock.Now()) - } - } - // At this point, waiting 3.7s for next attempt to downscale below 3 CU (last request was - // successful, but the one before it wasn't), and 0.8s for plugin tick. - // Also, because downscaling was approved, we should want to make a NeonVM request to do that. - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("2.3s")}, - NeonVMRequest: &core.ActionNeonVMRequest{ - Current: resForCU(6), - Target: resForCU(3), - DesiredLogicalTime: nil, - }, - }) - // Make the request: - a.Do(state.NeonVM().StartingRequest, time.Now(), resForCU(3)) - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("2.3s")}, - }) - clockTick() - a.Do(state.NeonVM().RequestSuccessful, time.Now()) - // Successfully scaled down, so we should now inform the plugin. 
But also, we'll want to retry - // the downscale request to vm-monitor once the retry is up: - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("3.9s")}, - PluginRequest: &core.ActionPluginRequest{ - LastPermit: lo.ToPtr(resForCU(6)), - Target: resForCU(3), - Metrics: lo.ToPtr(metrics.ToAPI()), - DesiredLogicalTime: nil, - }, - }) - a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(3)) - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("3.9s")}, - }) - clockTick() - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ - Permit: resForCU(3), - Migrate: nil, - }) - // ... And *now* there's nothing left to do but wait until downscale wait expires: - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("3.8s")}, - }) - - // so, wait for that: - clock.Inc(duration("3.8s")) - - // Third pass: deny requested downscaling. - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("3.1s")}, - MonitorDownscale: &core.ActionMonitorDownscale{ - Current: resForCU(3), - Target: resForCU(2), - DesiredLogicalTime: nil, - }, - }) - a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(2)) - clockTick() - a.Do(state.Monitor().DownscaleRequestDenied, clock.Now()) - // At the end, we should be waiting to retry downscaling (but actually, the regular plugin - // request is coming up sooner). - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("3.0s")}, - }) - // ... so, wait for that plugin request/response, and then wait to retry downscaling: - clock.Inc(duration("3s")) - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("1s")}, // still want to retry vm-monitor downscaling - PluginRequest: &core.ActionPluginRequest{ - LastPermit: lo.ToPtr(resForCU(3)), - Target: resForCU(3), - Metrics: lo.ToPtr(metrics.ToAPI()), - DesiredLogicalTime: nil, - }, - }) - a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(3)) - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("1s")}, // still waiting on retrying vm-monitor downscaling - }) - clockTick() - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ - Permit: resForCU(3), - Migrate: nil, - }) - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("0.9s")}, // yep, still waiting on retrying vm-monitor downscaling - }) - - clock.Inc(duration("0.9s")) - - // Fourth pass: approve down to 1 CU - wait to do the NeonVM requests until the end - currentPluginWait = duration("6.0s") - for cu := uint16(2); cu >= 1; cu -= 1 { - var expectedNeonVMRequest *core.ActionNeonVMRequest - if cu < 2 { - expectedNeonVMRequest = &core.ActionNeonVMRequest{ - Current: resForCU(3), - Target: resForCU(cu + 1), - DesiredLogicalTime: nil, - } - } - - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: currentPluginWait}, - MonitorDownscale: &core.ActionMonitorDownscale{ - Current: resForCU(cu + 1), - Target: resForCU(cu), - DesiredLogicalTime: nil, - }, - NeonVMRequest: expectedNeonVMRequest, - }) - a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(cu)) - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: currentPluginWait}, - NeonVMRequest: expectedNeonVMRequest, - }) - clockTick() - currentPluginWait -= clockTickDuration - 
a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now()) - } - // Still waiting on plugin request tick, but we can make a NeonVM request to enact the - // downscaling right away ! - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("5.8s")}, - NeonVMRequest: &core.ActionNeonVMRequest{ - Current: resForCU(3), - Target: resForCU(1), - DesiredLogicalTime: nil, - }, - }) - a.Do(state.NeonVM().StartingRequest, time.Now(), resForCU(1)) - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("5.8s")}, // yep, still waiting on the plugin - }) - clockTick() - a.Do(state.NeonVM().RequestSuccessful, time.Now()) - // Successfully downscaled, so now we should inform the plugin. Not waiting on any retries. - a.Call(nextActions).Equals(core.ActionSet{ - PluginRequest: &core.ActionPluginRequest{ - LastPermit: lo.ToPtr(resForCU(3)), - Target: resForCU(1), - Metrics: lo.ToPtr(metrics.ToAPI()), - DesiredLogicalTime: nil, - }, - }) - a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(1)) - a.Call(nextActions).Equals(core.ActionSet{ - // not waiting on anything! - }) - clockTick() - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ - Permit: resForCU(1), - Migrate: nil, - }) - // And now there's truly nothing left to do. Back to waiting on plugin request tick :) - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("6.9s")}, - }) -} - -// Checks that we scale up in a timely manner when the vm-monitor requests it, and don't request -// downscaling until the time expires. -func TestRequestedUpscale(t *testing.T) { - a := helpers.NewAssert(t) - clock := helpers.NewFakeClock(t) - clockTick := func() { - clock.Inc(100 * time.Millisecond) - } - resForCU := DefaultComputeUnit.Mul - - state := helpers.CreateInitialState( - DefaultInitialStateConfig, - helpers.WithStoredWarnings(a.StoredWarnings()), - helpers.WithConfigSetting(func(c *core.Config) { - c.MonitorRequestedUpscaleValidPeriod = duration("6s") // Override this for consistency - }), - ) - nextActions := func() core.ActionSet { - return state.NextActions(clock.Now()) - } - - state.Monitor().Active(true) - - // Send initial scheduler request: - doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(1)) - - // Set metrics - clockTick() - lastMetrics := core.SystemMetrics{ - LoadAverage1Min: 0.0, - MemoryUsageBytes: 0.0, - } - a.Do(state.UpdateSystemMetrics, lastMetrics) - - // Check we're not supposed to do anything - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("4.8s")}, - }) - - // Have the vm-monitor request upscaling: - a.Do(state.Monitor().UpscaleRequested, clock.Now(), api.MoreResources{Cpu: false, Memory: true}) - // First need to check with the scheduler plugin to get approval for upscaling: - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("6s")}, // if nothing else happens, requested upscale expires. 
- PluginRequest: &core.ActionPluginRequest{ - LastPermit: lo.ToPtr(resForCU(1)), - Target: resForCU(2), - Metrics: lo.ToPtr(lastMetrics.ToAPI()), - DesiredLogicalTime: nil, - }, - }) - a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(2)) - clockTick() - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("5.9s")}, // same waiting for requested upscale expiring - }) - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ - Permit: resForCU(2), - Migrate: nil, - }) - - // After approval from the scheduler plugin, now need to make NeonVM request: - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("4.9s")}, // plugin tick wait is earlier than requested upscale expiration - NeonVMRequest: &core.ActionNeonVMRequest{ - Current: resForCU(1), - Target: resForCU(2), - DesiredLogicalTime: nil, - }, - }) - a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(2)) - clockTick() - a.Do(state.NeonVM().RequestSuccessful, clock.Now()) - - // Finally, tell the vm-monitor that it got upscaled: - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("4.8s")}, // still waiting on plugin tick - MonitorUpscale: &core.ActionMonitorUpscale{ - Current: resForCU(1), - Target: resForCU(2), - DesiredLogicalTime: nil, - }, - }) - a.Do(state.Monitor().StartingUpscaleRequest, clock.Now(), resForCU(2)) - clockTick() - a.Do(state.Monitor().UpscaleRequestSuccessful, clock.Now()) - - // After everything, we should be waiting on both: - // (a) scheduler plugin tick (4.7s remaining), and - // (b) vm-monitor requested upscaling expiring (5.7s remaining) - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("4.7s")}, - }) - - // Do the routine scheduler plugin request. Still waiting 1s for vm-monitor request expiration - clock.Inc(duration("4.7s")) - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("1s")}, - PluginRequest: &core.ActionPluginRequest{ - LastPermit: lo.ToPtr(resForCU(2)), - Target: resForCU(2), - Metrics: lo.ToPtr(lastMetrics.ToAPI()), - DesiredLogicalTime: nil, - }, - }) - a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(2)) - clockTick() - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("0.9s")}, // waiting for requested upscale expiring - }) - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ - Permit: resForCU(2), - Migrate: nil, - }) - - // Still should just be waiting on vm-monitor upscale expiring - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("0.9s")}, - }) - clock.Inc(duration("0.9s")) - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("4s")}, // now, waiting on plugin request tick - MonitorDownscale: &core.ActionMonitorDownscale{ - Current: resForCU(2), - Target: resForCU(1), - DesiredLogicalTime: nil, - }, - }) -} - -// Checks that if we get new metrics partway through downscaling, then we pivot back to upscaling -// without further requests in furtherance of downscaling. // -// For example, if we pivot during the NeonVM request to do the downscaling, then the request to to -// the scheduler plugin should never be made, because we decided against downscaling. 
-func TestDownscalePivotBack(t *testing.T) { - a := helpers.NewAssert(t) - var clock *helpers.FakeClock - - clockTickDuration := duration("0.1s") - clockTick := func() helpers.Elapsed { - return clock.Inc(clockTickDuration) - } - halfClockTick := func() helpers.Elapsed { - return clock.Inc(clockTickDuration / 2) - } - resForCU := DefaultComputeUnit.Mul - - var state *core.State - nextActions := func() core.ActionSet { - return state.NextActions(clock.Now()) - } - - initialMetrics := core.SystemMetrics{ - LoadAverage1Min: 0.0, - MemoryUsageBytes: 0.0, - } - newMetrics := core.SystemMetrics{ - LoadAverage1Min: 0.3, - MemoryUsageBytes: 0.0, - } - - steps := []struct { - pre func(pluginWait *time.Duration, midRequest func()) - post func(pluginWait *time.Duration) - }{ - // vm-monitor requests: - { - pre: func(pluginWait *time.Duration, midRequest func()) { - t.Log(" > start vm-monitor downscale") - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: *pluginWait}, - MonitorDownscale: &core.ActionMonitorDownscale{ - Current: resForCU(2), - Target: resForCU(1), - - DesiredLogicalTime: nil, - }, - }) - a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(1)) - halfClockTick() - midRequest() - halfClockTick() - *pluginWait -= clockTickDuration - t.Log(" > finish vm-monitor downscale") - a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now()) - }, - post: func(pluginWait *time.Duration) { - t.Log(" > start vm-monitor upscale") - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: *pluginWait}, - MonitorUpscale: &core.ActionMonitorUpscale{ - Current: resForCU(1), - Target: resForCU(2), - DesiredLogicalTime: nil, - }, - }) - a.Do(state.Monitor().StartingUpscaleRequest, clock.Now(), resForCU(2)) - clockTick() - *pluginWait -= clockTickDuration - t.Log(" > finish vm-monitor upscale") - a.Do(state.Monitor().UpscaleRequestSuccessful, clock.Now()) - }, - }, - // NeonVM requests - { - pre: func(pluginWait *time.Duration, midRequest func()) { - t.Log(" > start NeonVM downscale") - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: *pluginWait}, - NeonVMRequest: &core.ActionNeonVMRequest{ - Current: resForCU(2), - Target: resForCU(1), - DesiredLogicalTime: nil, - }, - }) - a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(1)) - halfClockTick() - midRequest() - halfClockTick() - *pluginWait -= clockTickDuration - t.Log(" > finish NeonVM downscale") - a.Do(state.NeonVM().RequestSuccessful, clock.Now()) - }, - post: func(pluginWait *time.Duration) { - t.Log(" > start NeonVM upscale") - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: *pluginWait}, - NeonVMRequest: &core.ActionNeonVMRequest{ - Current: resForCU(1), - Target: resForCU(2), - DesiredLogicalTime: nil, - }, - }) - a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(2)) - clockTick() - *pluginWait -= clockTickDuration - t.Log(" > finish NeonVM upscale") - a.Do(state.NeonVM().RequestSuccessful, clock.Now()) - }, - }, - // plugin requests - { - pre: func(pluginWait *time.Duration, midRequest func()) { - t.Log(" > start plugin downscale") - a.Call(nextActions).Equals(core.ActionSet{ - PluginRequest: &core.ActionPluginRequest{ - LastPermit: lo.ToPtr(resForCU(2)), - Target: resForCU(1), - Metrics: lo.ToPtr(initialMetrics.ToAPI()), - DesiredLogicalTime: nil, - }, - }) - a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(1)) - halfClockTick() - midRequest() - halfClockTick() - *pluginWait = duration("4.9s") // 
reset because we just made a request - t.Log(" > finish plugin downscale") - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ - Permit: resForCU(1), - Migrate: nil, - }) - }, - post: func(pluginWait *time.Duration) { - t.Log(" > start plugin upscale") - a.Call(nextActions).Equals(core.ActionSet{ - PluginRequest: &core.ActionPluginRequest{ - LastPermit: lo.ToPtr(resForCU(1)), - Target: resForCU(2), - Metrics: lo.ToPtr(newMetrics.ToAPI()), - DesiredLogicalTime: nil, - }, - }) - a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(2)) - clockTick() - *pluginWait = duration("4.9s") // reset because we just made a request - t.Log(" > finish plugin upscale") - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ - Permit: resForCU(2), - Migrate: nil, - }) - }, - }, - } - - for i := 0; i < len(steps); i++ { - t.Logf("iter(%d)", i) - - // Initial setup - clock = helpers.NewFakeClock(t) - state = helpers.CreateInitialState( - DefaultInitialStateConfig, - helpers.WithStoredWarnings(a.StoredWarnings()), - helpers.WithMinMaxCU(1, 3), - helpers.WithCurrentCU(2), - ) - - state.Monitor().Active(true) - - doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(2)) - - clockTick().AssertEquals(duration("0.2s")) - pluginWait := duration("4.8s") - - a.Do(state.UpdateSystemMetrics, initialMetrics) - // double-check that we agree about the desired resources - a.Call(getDesiredResources, state, clock.Now()). - Equals(resForCU(1)) - - for j := 0; j <= i; j++ { - midRequest := func() {} - if j == i { - // at the midpoint, start backtracking by setting the metrics - midRequest = func() { - t.Log(" > > updating metrics mid-request") - a.Do(state.UpdateSystemMetrics, newMetrics) - a.Call(getDesiredResources, state, clock.Now()). - Equals(resForCU(2)) - } - } - - steps[j].pre(&pluginWait, midRequest) - } - - for j := i; j >= 0; j-- { - steps[j].post(&pluginWait) - } - } -} - -// Checks that if the VM's min/max bounds change so that the maximum is below the current and -// desired usage, we try to downscale -func TestBoundsChangeRequiresDownsale(t *testing.T) { - a := helpers.NewAssert(t) - clock := helpers.NewFakeClock(t) - clockTick := func() { - clock.Inc(100 * time.Millisecond) - } - resForCU := DefaultComputeUnit.Mul - - state := helpers.CreateInitialState( - DefaultInitialStateConfig, - helpers.WithStoredWarnings(a.StoredWarnings()), - helpers.WithMinMaxCU(1, 3), - helpers.WithCurrentCU(2), - ) - nextActions := func() core.ActionSet { - return state.NextActions(clock.Now()) - } - - state.Monitor().Active(true) - - // Send initial scheduler request: - doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(2)) - - clockTick() - - // Set metrics so the desired resources are still 2 CU - metrics := core.SystemMetrics{ - LoadAverage1Min: 0.3, - MemoryUsageBytes: 0.0, - } - a.Do(state.UpdateSystemMetrics, metrics) - // Check that we agree about desired resources - a.Call(getDesiredResources, state, clock.Now()). 
- Equals(resForCU(2)) - // Check we've got nothing to do yet - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("4.8s")}, - }) - - clockTick() - - // Update the VM to set min=max=1 CU - a.Do(state.UpdatedVM, helpers.CreateVmInfo( - DefaultInitialStateConfig.VM, - helpers.WithCurrentCU(2), - helpers.WithMinMaxCU(1, 1), - )) - - // We should be making a vm-monitor downscaling request - // TODO: In the future, we should have a "force-downscale" alternative so the vm-monitor doesn't - // get to deny the downscaling. - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("4.7s")}, - MonitorDownscale: &core.ActionMonitorDownscale{ - Current: resForCU(2), - Target: resForCU(1), - - DesiredLogicalTime: nil, - }, - }) - a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(1)) - clockTick() - a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now()) - // Do NeonVM request for that downscaling - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("4.6s")}, - NeonVMRequest: &core.ActionNeonVMRequest{ - Current: resForCU(2), - Target: resForCU(1), - - DesiredLogicalTime: nil, - }, - }) - a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(1)) - clockTick() - a.Do(state.NeonVM().RequestSuccessful, clock.Now()) - // Do plugin request for that downscaling: - a.Call(nextActions).Equals(core.ActionSet{ - PluginRequest: &core.ActionPluginRequest{ - LastPermit: lo.ToPtr(resForCU(2)), - Target: resForCU(1), - Metrics: lo.ToPtr(metrics.ToAPI()), - DesiredLogicalTime: nil, - }, - }) - a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(1)) - clockTick() - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ - Permit: resForCU(1), - Migrate: nil, - }) - // And then, we shouldn't need to do anything else: - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("4.9s")}, - }) -} - -// Checks that if the VM's min/max bounds change so that the minimum is above the current and -// desired usage, we try to upscale -func TestBoundsChangeRequiresUpscale(t *testing.T) { - a := helpers.NewAssert(t) - clock := helpers.NewFakeClock(t) - clockTick := func() { - clock.Inc(100 * time.Millisecond) - } - resForCU := DefaultComputeUnit.Mul - - state := helpers.CreateInitialState( - DefaultInitialStateConfig, - helpers.WithStoredWarnings(a.StoredWarnings()), - helpers.WithMinMaxCU(1, 3), - helpers.WithCurrentCU(2), - ) - nextActions := func() core.ActionSet { - return state.NextActions(clock.Now()) - } - - state.Monitor().Active(true) - - // Send initial scheduler request: - doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(2)) - - clockTick() - - // Set metrics so the desired resources are still 2 CU - metrics := core.SystemMetrics{ - LoadAverage1Min: 0.3, - MemoryUsageBytes: 0.0, - } - a.Do(state.UpdateSystemMetrics, metrics) - // Check that we agree about desired resources - a.Call(getDesiredResources, state, clock.Now()). 
- Equals(resForCU(2)) - // Check we've got nothing to do yet - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("4.8s")}, - }) - - clockTick() - - // Update the VM to set min=max=3 CU - a.Do(state.UpdatedVM, helpers.CreateVmInfo( - DefaultInitialStateConfig.VM, - helpers.WithCurrentCU(2), - helpers.WithMinMaxCU(3, 3), - )) - - // We should be making a plugin request to get upscaling: - a.Call(nextActions).Equals(core.ActionSet{ - PluginRequest: &core.ActionPluginRequest{ - LastPermit: lo.ToPtr(resForCU(2)), - Target: resForCU(3), - Metrics: lo.ToPtr(metrics.ToAPI()), - DesiredLogicalTime: nil, - }, - }) - a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(3)) - clockTick() - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ - Permit: resForCU(3), - Migrate: nil, - }) - // Do NeonVM request for the upscaling - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("4.9s")}, - NeonVMRequest: &core.ActionNeonVMRequest{ - Current: resForCU(2), - Target: resForCU(3), - DesiredLogicalTime: nil, - }, - }) - a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(3)) - clockTick() - a.Do(state.NeonVM().RequestSuccessful, clock.Now()) - // Do vm-monitor upscale request - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("4.8s")}, - MonitorUpscale: &core.ActionMonitorUpscale{ - Current: resForCU(2), - Target: resForCU(3), - DesiredLogicalTime: nil, - }, - }) - a.Do(state.Monitor().StartingUpscaleRequest, clock.Now(), resForCU(3)) - clockTick() - a.Do(state.Monitor().UpscaleRequestSuccessful, clock.Now()) - // And then, we shouldn't need to do anything else: - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("4.7s")}, - }) -} - -// Checks that failed requests to the scheduler plugin and NeonVM API will be retried after a delay -func TestFailedRequestRetry(t *testing.T) { - a := helpers.NewAssert(t) - clock := helpers.NewFakeClock(t) - clockTick := func() { - clock.Inc(100 * time.Millisecond) - } - resForCU := DefaultComputeUnit.Mul - - state := helpers.CreateInitialState( - DefaultInitialStateConfig, - helpers.WithStoredWarnings(a.StoredWarnings()), - helpers.WithMinMaxCU(1, 2), - helpers.WithCurrentCU(1), - helpers.WithConfigSetting(func(c *core.Config) { - // Override values for consistency and ease of use - c.PluginRetryWait = duration("2s") - c.NeonVMRetryWait = duration("3s") - }), - ) - nextActions := func() core.ActionSet { - return state.NextActions(clock.Now()) - } - - state.Monitor().Active(true) - - // Send initial scheduler request - doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(1)) - - // Set metrics so that we should be trying to upscale - clockTick() - metrics := core.SystemMetrics{ - LoadAverage1Min: 0.3, - MemoryUsageBytes: 0.0, - } - a.Do(state.UpdateSystemMetrics, metrics) - - // We should be asking the scheduler for upscaling - a.Call(nextActions).Equals(core.ActionSet{ - PluginRequest: &core.ActionPluginRequest{ - LastPermit: lo.ToPtr(resForCU(1)), - Target: resForCU(2), - Metrics: lo.ToPtr(metrics.ToAPI()), - DesiredLogicalTime: nil, - }, - }) - a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(2)) - clockTick() - // On request failure, we retry after Config.PluginRetryWait - a.Do(state.Plugin().RequestFailed, clock.Now()) - a. - WithWarnings("Wanted to make a request to the scheduler plugin, but previous request failed too recently"). 
- Call(nextActions). - Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("2s")}, - }) - clock.Inc(duration("2s")) - // ... and then retry: - a.Call(nextActions).Equals(core.ActionSet{ - PluginRequest: &core.ActionPluginRequest{ - LastPermit: lo.ToPtr(resForCU(1)), - Target: resForCU(2), - Metrics: lo.ToPtr(metrics.ToAPI()), - DesiredLogicalTime: nil, - }, - }) - a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(2)) - clockTick() - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ - Permit: resForCU(2), - Migrate: nil, - }) - - // Now, after plugin request is successful, we should be making a request to NeonVM. - // We'll have that request fail the first time as well: - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("4.9s")}, // plugin request tick - NeonVMRequest: &core.ActionNeonVMRequest{ - Current: resForCU(1), - Target: resForCU(2), - DesiredLogicalTime: nil, - }, - }) - a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(2)) - clockTick() - // On request failure, we retry after Config.NeonVMRetryWait - a.Do(state.NeonVM().RequestFailed, clock.Now()) - a. - WithWarnings("Wanted to make a request to NeonVM API, but recent request failed too recently"). - Call(nextActions). - Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("3s")}, // NeonVM retry wait is less than current plugin request tick (4.8s remaining) - }) - clock.Inc(duration("3s")) - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("1.8s")}, // plugin request tick - NeonVMRequest: &core.ActionNeonVMRequest{ - Current: resForCU(1), - Target: resForCU(2), - DesiredLogicalTime: nil, - }, - }) - a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(2)) - clockTick() - a.Do(state.NeonVM().RequestSuccessful, clock.Now()) - - // And then finally, we should be looking to inform the vm-monitor about this upscaling. - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("1.7s")}, // plugin request tick - MonitorUpscale: &core.ActionMonitorUpscale{ - Current: resForCU(1), - Target: resForCU(2), - DesiredLogicalTime: nil, - }, - }) -} - -// Checks that when metrics are updated during the downscaling process, between the NeonVM request -// and plugin request, we keep those processes mostly separate, without interference between them. +//// Test that in a stable state, requests to the plugin happen exactly every Config.PluginRequestTick +//func TestPeriodicPluginRequest(t *testing.T) { +// a := helpers.NewAssert(t) +// clock := helpers.NewFakeClock(t) // -// This is distilled from a bug found on staging that resulted in faulty requests to the plugin. -func TestMetricsConcurrentUpdatedDuringDownscale(t *testing.T) { - a := helpers.NewAssert(t) - clock := helpers.NewFakeClock(t) - clockTick := func() { - clock.Inc(100 * time.Millisecond) - } - resForCU := DefaultComputeUnit.Mul - - state := helpers.CreateInitialState( - DefaultInitialStateConfig, - helpers.WithStoredWarnings(a.StoredWarnings()), - // NOTE: current CU is greater than max CU. This is in line with what happens when - // unassigned pooled VMs created by the control plane are first assigned and endpoint and - // must immediately scale down. 
- helpers.WithMinMaxCU(1, 2), - helpers.WithCurrentCU(3), - ) - nextActions := func() core.ActionSet { - return state.NextActions(clock.Now()) - } - - // Send initial scheduler request - without the monitor active, so we're stuck at 4 CU for now - a. - WithWarnings("Wanted to send vm-monitor downscale request, but there's no active connection"). - Call(state.NextActions, clock.Now()). - Equals(core.ActionSet{ - PluginRequest: &core.ActionPluginRequest{ - LastPermit: nil, - Target: resForCU(3), - Metrics: nil, - DesiredLogicalTime: nil, - }, - }) - a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(3)) - clockTick() - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ - Permit: resForCU(3), - Migrate: nil, - }) - - clockTick() - - // Monitor's now active, so we should be asking it for downscaling. - // We don't yet have metrics though, so we only want to downscale as much as is required. - state.Monitor().Active(true) - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("4.8s")}, - MonitorDownscale: &core.ActionMonitorDownscale{ - Current: resForCU(3), - Target: resForCU(2), - DesiredLogicalTime: nil, - }, - }) - a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(2)) - - // In the middle of the vm-monitor request, update the metrics so that now the desired resource - // usage is actually 1 CU - clockTick() - // the actual metrics we got in the actual logs - metrics := core.SystemMetrics{ - LoadAverage1Min: 0.0, - MemoryUsageBytes: 150589570, // 143.6 MiB - } - a.Do(state.UpdateSystemMetrics, metrics) - - // nothing to do yet, until the existing vm-monitor request finishes - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("4.7s")}, // plugin request tick wait - }) - - clockTick() - - // When the vm-monitor request finishes, we want to both - // (a) request additional downscaling from vm-monitor, and - // (b) make a NeonVM request for the initially approved downscaling - a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now()) - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("4.6s")}, // plugin request tick wait - NeonVMRequest: &core.ActionNeonVMRequest{ - Current: resForCU(3), - Target: resForCU(2), - DesiredLogicalTime: nil, - }, - MonitorDownscale: &core.ActionMonitorDownscale{ - Current: resForCU(2), - Target: resForCU(1), - DesiredLogicalTime: nil, - }, - }) - // Start both requests. The vm-monitor request will finish first, but after that we'll just be - // waiting on the NeonVM request (and then redoing a follow-up for more downscaling). - a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(2)) - a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(1)) - - clockTick() - - a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now()) - a. - WithWarnings( - "Wanted to make a request to NeonVM API, but there's already NeonVM request (for different resources) ongoing", - ). - Call(nextActions). - Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("4.5s")}, // plugin request tick wait - }) - - clockTick() - - a.Do(state.NeonVM().RequestSuccessful, clock.Now()) - state.Debug(true) - a. - Call(nextActions). - Equals(core.ActionSet{ - // At this point in the original logs from staging, the intended request to the plugin was - // incorrectly for 1 CU, rather than 2 CU. So, the rest of this test case is mostly just - // rounding out the rest of the scale-down routine. 
- PluginRequest: &core.ActionPluginRequest{ - LastPermit: lo.ToPtr(resForCU(3)), - Target: resForCU(2), - Metrics: lo.ToPtr(metrics.ToAPI()), - DesiredLogicalTime: nil, - }, - NeonVMRequest: &core.ActionNeonVMRequest{ - Current: resForCU(2), - Target: resForCU(1), - DesiredLogicalTime: nil, - }, - }) - a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(2)) - a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(1)) - - clockTick() - - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ - Permit: resForCU(2), - Migrate: nil, - }) - // Still waiting for NeonVM request to complete - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("4.9s")}, // plugin request tick wait - }) - - clockTick() - - // After the NeonVM request finishes, all that we have left to do is inform the plugin of the - // final downscaling. - a.Do(state.NeonVM().RequestSuccessful, clock.Now()) - a.Call(nextActions).Equals(core.ActionSet{ - PluginRequest: &core.ActionPluginRequest{ - LastPermit: lo.ToPtr(resForCU(2)), - Target: resForCU(1), - Metrics: lo.ToPtr(metrics.ToAPI()), - DesiredLogicalTime: nil, - }, - }) - a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(1)) - - clockTick() - - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ - Permit: resForCU(1), - Migrate: nil, - }) - // Nothing left to do - a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("4.9s")}, // plugin request tick wait - }) -} +// state := helpers.CreateInitialState( +// DefaultInitialStateConfig, +// helpers.WithStoredWarnings(a.StoredWarnings()), +// ) +// +// state.Monitor().Active(true) +// +// metrics := core.SystemMetrics{ +// LoadAverage1Min: 0.0, +// MemoryUsageBytes: 0.0, +// } +// resources := DefaultComputeUnit +// +// a.Do(state.UpdateSystemMetrics, metrics) +// +// base := duration("0s") +// clock.Elapsed().AssertEquals(base) +// +// clockTick := duration("100ms") +// reqDuration := duration("50ms") +// reqEvery := DefaultInitialStateConfig.Core.PluginRequestTick +// endTime := duration("20s") +// +// doInitialPluginRequest(a, state, clock, clockTick, lo.ToPtr(metrics.ToAPI()), resources) +// +// for clock.Elapsed().Duration < endTime { +// timeSinceScheduledRequest := (clock.Elapsed().Duration - base) % reqEvery +// +// if timeSinceScheduledRequest != 0 { +// timeUntilNextRequest := reqEvery - timeSinceScheduledRequest +// a.Call(state.NextActions, clock.Now()).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: timeUntilNextRequest}, +// }) +// clock.Inc(clockTick) +// } else { +// a.Call(state.NextActions, clock.Now()).Equals(core.ActionSet{ +// PluginRequest: &core.ActionPluginRequest{ +// LastPermit: &resources, +// Target: resources, +// Metrics: lo.ToPtr(metrics.ToAPI()), +// TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), +// }, +// }) +// a.Do(state.Plugin().StartingRequest, clock.Now(), resources) +// a.Call(state.NextActions, clock.Now()).Equals(core.ActionSet{}) +// clock.Inc(reqDuration) +// a.Call(state.NextActions, clock.Now()).Equals(core.ActionSet{}) +// a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ +// Permit: resources, +// Migrate: nil, +// }) +// clock.Inc(clockTick - reqDuration) +// } +// } +//} +// +//// Checks that when downscaling is denied, we both (a) try again with higher resources, or (b) wait +//// to retry if there aren't higher resources to try with. 
+//func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { +// a := helpers.NewAssert(t) +// clock := helpers.NewFakeClock(t) +// clockTickDuration := duration("0.1s") +// clockTick := func() { +// clock.Inc(clockTickDuration) +// } +// resForCU := DefaultComputeUnit.Mul +// +// state := helpers.CreateInitialState( +// DefaultInitialStateConfig, +// helpers.WithStoredWarnings(a.StoredWarnings()), +// helpers.WithMinMaxCU(1, 8), +// helpers.WithCurrentCU(6), // NOTE: Start at 6 CU, so we're trying to scale down immediately. +// helpers.WithConfigSetting(func(c *core.Config) { +// // values close to the default, so request timing works out a little better. +// c.PluginRequestTick = duration("7s") +// c.MonitorDeniedDownscaleCooldown = duration("4s") +// }), +// ) +// +// nextActions := func() core.ActionSet { +// return state.NextActions(clock.Now()) +// } +// +// state.Monitor().Active(true) +// +// doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(6)) +// +// // Set metrics +// clockTick() +// metrics := core.SystemMetrics{ +// LoadAverage1Min: 0.0, +// MemoryUsageBytes: 0.0, +// } +// a.Do(state.UpdateSystemMetrics, metrics) +// // double-check that we agree about the desired resources +// a.Call(getDesiredResources, state, clock.Now()). +// Equals(resForCU(1)) +// +// // Broadly the idea here is that we should be trying to request downscaling from the vm-monitor, +// // and retrying with progressively higher values until either we get approved, or we run out of +// // options, at which point we should wait until later to re-request downscaling. +// // +// // This behavior results in linear retry passes. +// // +// // For this test, we: +// // 1. Deny any request in the first pass +// // 2. Approve only down to 3 CU on the second pass +// // a. triggers NeonVM request +// // b. triggers plugin request +// // 3. Deny all requests in the third pass (i.e. stay at 3 CU) +// // 4. Approve down to 1 CU on the fourth pass +// // a. triggers NeonVM request +// // b. triggers plugin request +// // +// // ---- +// // +// // First pass: deny downscaling. +// clock.Elapsed() +// +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("6.8s")}, +// MonitorDownscale: &core.ActionMonitorDownscale{ +// Current: resForCU(6), +// Target: resForCU(5), +// DesiredLogicalTime: nil, +// }, +// }) +// a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(5)) +// clockTick() +// a.Do(state.Monitor().DownscaleRequestDenied, clock.Now()) +// +// // At the end, we should be waiting to retry downscaling: +// a.Call(nextActions).Equals(core.ActionSet{ +// // Taken from DefaultInitialStateConfig.Core.MonitorDeniedDownscaleCooldown +// Wait: &core.ActionWait{Duration: duration("4.0s")}, +// }) +// +// clock.Inc(duration("4s")) +// currentPluginWait := duration("2.7s") +// +// // Second pass: Approve only down to 3 CU, then NeonVM & plugin requests. 
+// for cu := uint16(5); cu >= 2; cu -= 1 { +// var expectedNeonVMRequest *core.ActionNeonVMRequest +// if cu < 5 { +// expectedNeonVMRequest = &core.ActionNeonVMRequest{ +// Current: resForCU(6), +// Target: resForCU(cu + 1), +// DesiredLogicalTime: nil, +// } +// } +// +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: currentPluginWait}, +// MonitorDownscale: &core.ActionMonitorDownscale{ +// Current: resForCU(cu + 1), +// Target: resForCU(cu), +// DesiredLogicalTime: nil, +// }, +// NeonVMRequest: expectedNeonVMRequest, +// }) +// a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(cu)) +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: currentPluginWait}, +// NeonVMRequest: expectedNeonVMRequest, +// }) +// clockTick() +// currentPluginWait -= clockTickDuration +// if cu >= 3 /* allow down to 3 */ { +// a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now()) +// } else { +// a.Do(state.Monitor().DownscaleRequestDenied, clock.Now()) +// } +// } +// // At this point, waiting 3.7s for next attempt to downscale below 3 CU (last request was +// // successful, but the one before it wasn't), and 0.8s for plugin tick. +// // Also, because downscaling was approved, we should want to make a NeonVM request to do that. +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("2.3s")}, +// NeonVMRequest: &core.ActionNeonVMRequest{ +// Current: resForCU(6), +// Target: resForCU(3), +// DesiredLogicalTime: nil, +// }, +// }) +// // Make the request: +// a.Do(state.NeonVM().StartingRequest, time.Now(), resForCU(3)) +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("2.3s")}, +// }) +// clockTick() +// a.Do(state.NeonVM().RequestSuccessful, time.Now()) +// // Successfully scaled down, so we should now inform the plugin. But also, we'll want to retry +// // the downscale request to vm-monitor once the retry is up: +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("3.9s")}, +// PluginRequest: &core.ActionPluginRequest{ +// LastPermit: lo.ToPtr(resForCU(6)), +// Target: resForCU(3), +// Metrics: lo.ToPtr(metrics.ToAPI()), +// DesiredLogicalTime: nil, +// }, +// }) +// a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(3)) +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("3.9s")}, +// }) +// clockTick() +// a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ +// Permit: resForCU(3), +// Migrate: nil, +// }) +// // ... And *now* there's nothing left to do but wait until downscale wait expires: +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("3.8s")}, +// }) +// +// // so, wait for that: +// clock.Inc(duration("3.8s")) +// +// // Third pass: deny requested downscaling. +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("3.1s")}, +// MonitorDownscale: &core.ActionMonitorDownscale{ +// Current: resForCU(3), +// Target: resForCU(2), +// DesiredLogicalTime: nil, +// }, +// }) +// a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(2)) +// clockTick() +// a.Do(state.Monitor().DownscaleRequestDenied, clock.Now()) +// // At the end, we should be waiting to retry downscaling (but actually, the regular plugin +// // request is coming up sooner). 
+// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("3.0s")}, +// }) +// // ... so, wait for that plugin request/response, and then wait to retry downscaling: +// clock.Inc(duration("3s")) +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("1s")}, // still want to retry vm-monitor downscaling +// PluginRequest: &core.ActionPluginRequest{ +// LastPermit: lo.ToPtr(resForCU(3)), +// Target: resForCU(3), +// Metrics: lo.ToPtr(metrics.ToAPI()), +// DesiredLogicalTime: nil, +// }, +// }) +// a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(3)) +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("1s")}, // still waiting on retrying vm-monitor downscaling +// }) +// clockTick() +// a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ +// Permit: resForCU(3), +// Migrate: nil, +// }) +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("0.9s")}, // yep, still waiting on retrying vm-monitor downscaling +// }) +// +// clock.Inc(duration("0.9s")) +// +// // Fourth pass: approve down to 1 CU - wait to do the NeonVM requests until the end +// currentPluginWait = duration("6.0s") +// for cu := uint16(2); cu >= 1; cu -= 1 { +// var expectedNeonVMRequest *core.ActionNeonVMRequest +// if cu < 2 { +// expectedNeonVMRequest = &core.ActionNeonVMRequest{ +// Current: resForCU(3), +// Target: resForCU(cu + 1), +// DesiredLogicalTime: nil, +// } +// } +// +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: currentPluginWait}, +// MonitorDownscale: &core.ActionMonitorDownscale{ +// Current: resForCU(cu + 1), +// Target: resForCU(cu), +// DesiredLogicalTime: nil, +// }, +// NeonVMRequest: expectedNeonVMRequest, +// }) +// a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(cu)) +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: currentPluginWait}, +// NeonVMRequest: expectedNeonVMRequest, +// }) +// clockTick() +// currentPluginWait -= clockTickDuration +// a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now()) +// } +// // Still waiting on plugin request tick, but we can make a NeonVM request to enact the +// // downscaling right away ! +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("5.8s")}, +// NeonVMRequest: &core.ActionNeonVMRequest{ +// Current: resForCU(3), +// Target: resForCU(1), +// DesiredLogicalTime: nil, +// }, +// }) +// a.Do(state.NeonVM().StartingRequest, time.Now(), resForCU(1)) +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("5.8s")}, // yep, still waiting on the plugin +// }) +// clockTick() +// a.Do(state.NeonVM().RequestSuccessful, time.Now()) +// // Successfully downscaled, so now we should inform the plugin. Not waiting on any retries. +// a.Call(nextActions).Equals(core.ActionSet{ +// PluginRequest: &core.ActionPluginRequest{ +// LastPermit: lo.ToPtr(resForCU(3)), +// Target: resForCU(1), +// Metrics: lo.ToPtr(metrics.ToAPI()), +// DesiredLogicalTime: nil, +// }, +// }) +// a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(1)) +// a.Call(nextActions).Equals(core.ActionSet{ +// // not waiting on anything! 
+// }) +// clockTick() +// a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ +// Permit: resForCU(1), +// Migrate: nil, +// }) +// // And now there's truly nothing left to do. Back to waiting on plugin request tick :) +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("6.9s")}, +// }) +//} +// +//// Checks that we scale up in a timely manner when the vm-monitor requests it, and don't request +//// downscaling until the time expires. +//func TestRequestedUpscale(t *testing.T) { +// a := helpers.NewAssert(t) +// clock := helpers.NewFakeClock(t) +// clockTick := func() { +// clock.Inc(100 * time.Millisecond) +// } +// resForCU := DefaultComputeUnit.Mul +// +// state := helpers.CreateInitialState( +// DefaultInitialStateConfig, +// helpers.WithStoredWarnings(a.StoredWarnings()), +// helpers.WithConfigSetting(func(c *core.Config) { +// c.MonitorRequestedUpscaleValidPeriod = duration("6s") // Override this for consistency +// }), +// ) +// nextActions := func() core.ActionSet { +// return state.NextActions(clock.Now()) +// } +// +// state.Monitor().Active(true) +// +// // Send initial scheduler request: +// doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(1)) +// +// // Set metrics +// clockTick() +// lastMetrics := core.SystemMetrics{ +// LoadAverage1Min: 0.0, +// MemoryUsageBytes: 0.0, +// } +// a.Do(state.UpdateSystemMetrics, lastMetrics) +// +// // Check we're not supposed to do anything +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("4.8s")}, +// }) +// +// // Have the vm-monitor request upscaling: +// a.Do(state.Monitor().UpscaleRequested, clock.Now(), api.MoreResources{Cpu: false, Memory: true}) +// // First need to check with the scheduler plugin to get approval for upscaling: +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("6s")}, // if nothing else happens, requested upscale expires. 
+// PluginRequest: &core.ActionPluginRequest{ +// LastPermit: lo.ToPtr(resForCU(1)), +// Target: resForCU(2), +// Metrics: lo.ToPtr(lastMetrics.ToAPI()), +// DesiredLogicalTime: nil, +// }, +// }) +// a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(2)) +// clockTick() +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("5.9s")}, // same waiting for requested upscale expiring +// }) +// a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ +// Permit: resForCU(2), +// Migrate: nil, +// }) +// +// // After approval from the scheduler plugin, now need to make NeonVM request: +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("4.9s")}, // plugin tick wait is earlier than requested upscale expiration +// NeonVMRequest: &core.ActionNeonVMRequest{ +// Current: resForCU(1), +// Target: resForCU(2), +// DesiredLogicalTime: nil, +// }, +// }) +// a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(2)) +// clockTick() +// a.Do(state.NeonVM().RequestSuccessful, clock.Now()) +// +// // Finally, tell the vm-monitor that it got upscaled: +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("4.8s")}, // still waiting on plugin tick +// MonitorUpscale: &core.ActionMonitorUpscale{ +// Current: resForCU(1), +// Target: resForCU(2), +// DesiredLogicalTime: nil, +// }, +// }) +// a.Do(state.Monitor().StartingUpscaleRequest, clock.Now(), resForCU(2)) +// clockTick() +// a.Do(state.Monitor().UpscaleRequestSuccessful, clock.Now()) +// +// // After everything, we should be waiting on both: +// // (a) scheduler plugin tick (4.7s remaining), and +// // (b) vm-monitor requested upscaling expiring (5.7s remaining) +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("4.7s")}, +// }) +// +// // Do the routine scheduler plugin request. Still waiting 1s for vm-monitor request expiration +// clock.Inc(duration("4.7s")) +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("1s")}, +// PluginRequest: &core.ActionPluginRequest{ +// LastPermit: lo.ToPtr(resForCU(2)), +// Target: resForCU(2), +// Metrics: lo.ToPtr(lastMetrics.ToAPI()), +// DesiredLogicalTime: nil, +// }, +// }) +// a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(2)) +// clockTick() +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("0.9s")}, // waiting for requested upscale expiring +// }) +// a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ +// Permit: resForCU(2), +// Migrate: nil, +// }) +// +// // Still should just be waiting on vm-monitor upscale expiring +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("0.9s")}, +// }) +// clock.Inc(duration("0.9s")) +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("4s")}, // now, waiting on plugin request tick +// MonitorDownscale: &core.ActionMonitorDownscale{ +// Current: resForCU(2), +// Target: resForCU(1), +// DesiredLogicalTime: nil, +// }, +// }) +//} +// +//// Checks that if we get new metrics partway through downscaling, then we pivot back to upscaling +//// without further requests in furtherance of downscaling. 
+//// +//// For example, if we pivot during the NeonVM request to do the downscaling, then the request to to +//// the scheduler plugin should never be made, because we decided against downscaling. +//func TestDownscalePivotBack(t *testing.T) { +// a := helpers.NewAssert(t) +// var clock *helpers.FakeClock +// +// clockTickDuration := duration("0.1s") +// clockTick := func() helpers.Elapsed { +// return clock.Inc(clockTickDuration) +// } +// halfClockTick := func() helpers.Elapsed { +// return clock.Inc(clockTickDuration / 2) +// } +// resForCU := DefaultComputeUnit.Mul +// +// var state *core.State +// nextActions := func() core.ActionSet { +// return state.NextActions(clock.Now()) +// } +// +// initialMetrics := core.SystemMetrics{ +// LoadAverage1Min: 0.0, +// MemoryUsageBytes: 0.0, +// } +// newMetrics := core.SystemMetrics{ +// LoadAverage1Min: 0.3, +// MemoryUsageBytes: 0.0, +// } +// +// steps := []struct { +// pre func(pluginWait *time.Duration, midRequest func()) +// post func(pluginWait *time.Duration) +// }{ +// // vm-monitor requests: +// { +// pre: func(pluginWait *time.Duration, midRequest func()) { +// t.Log(" > start vm-monitor downscale") +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: *pluginWait}, +// MonitorDownscale: &core.ActionMonitorDownscale{ +// Current: resForCU(2), +// Target: resForCU(1), +// +// DesiredLogicalTime: nil, +// }, +// }) +// a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(1)) +// halfClockTick() +// midRequest() +// halfClockTick() +// *pluginWait -= clockTickDuration +// t.Log(" > finish vm-monitor downscale") +// a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now()) +// }, +// post: func(pluginWait *time.Duration) { +// t.Log(" > start vm-monitor upscale") +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: *pluginWait}, +// MonitorUpscale: &core.ActionMonitorUpscale{ +// Current: resForCU(1), +// Target: resForCU(2), +// DesiredLogicalTime: nil, +// }, +// }) +// a.Do(state.Monitor().StartingUpscaleRequest, clock.Now(), resForCU(2)) +// clockTick() +// *pluginWait -= clockTickDuration +// t.Log(" > finish vm-monitor upscale") +// a.Do(state.Monitor().UpscaleRequestSuccessful, clock.Now()) +// }, +// }, +// // NeonVM requests +// { +// pre: func(pluginWait *time.Duration, midRequest func()) { +// t.Log(" > start NeonVM downscale") +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: *pluginWait}, +// NeonVMRequest: &core.ActionNeonVMRequest{ +// Current: resForCU(2), +// Target: resForCU(1), +// DesiredLogicalTime: nil, +// }, +// }) +// a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(1)) +// halfClockTick() +// midRequest() +// halfClockTick() +// *pluginWait -= clockTickDuration +// t.Log(" > finish NeonVM downscale") +// a.Do(state.NeonVM().RequestSuccessful, clock.Now()) +// }, +// post: func(pluginWait *time.Duration) { +// t.Log(" > start NeonVM upscale") +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: *pluginWait}, +// NeonVMRequest: &core.ActionNeonVMRequest{ +// Current: resForCU(1), +// Target: resForCU(2), +// DesiredLogicalTime: nil, +// }, +// }) +// a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(2)) +// clockTick() +// *pluginWait -= clockTickDuration +// t.Log(" > finish NeonVM upscale") +// a.Do(state.NeonVM().RequestSuccessful, clock.Now()) +// }, +// }, +// // plugin requests +// { +// pre: func(pluginWait *time.Duration, midRequest func()) { +// 
t.Log(" > start plugin downscale") +// a.Call(nextActions).Equals(core.ActionSet{ +// PluginRequest: &core.ActionPluginRequest{ +// LastPermit: lo.ToPtr(resForCU(2)), +// Target: resForCU(1), +// Metrics: lo.ToPtr(initialMetrics.ToAPI()), +// DesiredLogicalTime: nil, +// }, +// }) +// a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(1)) +// halfClockTick() +// midRequest() +// halfClockTick() +// *pluginWait = duration("4.9s") // reset because we just made a request +// t.Log(" > finish plugin downscale") +// a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ +// Permit: resForCU(1), +// Migrate: nil, +// }) +// }, +// post: func(pluginWait *time.Duration) { +// t.Log(" > start plugin upscale") +// a.Call(nextActions).Equals(core.ActionSet{ +// PluginRequest: &core.ActionPluginRequest{ +// LastPermit: lo.ToPtr(resForCU(1)), +// Target: resForCU(2), +// Metrics: lo.ToPtr(newMetrics.ToAPI()), +// DesiredLogicalTime: nil, +// }, +// }) +// a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(2)) +// clockTick() +// *pluginWait = duration("4.9s") // reset because we just made a request +// t.Log(" > finish plugin upscale") +// a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ +// Permit: resForCU(2), +// Migrate: nil, +// }) +// }, +// }, +// } +// +// for i := 0; i < len(steps); i++ { +// t.Logf("iter(%d)", i) +// +// // Initial setup +// clock = helpers.NewFakeClock(t) +// state = helpers.CreateInitialState( +// DefaultInitialStateConfig, +// helpers.WithStoredWarnings(a.StoredWarnings()), +// helpers.WithMinMaxCU(1, 3), +// helpers.WithCurrentCU(2), +// ) +// +// state.Monitor().Active(true) +// +// doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(2)) +// +// clockTick().AssertEquals(duration("0.2s")) +// pluginWait := duration("4.8s") +// +// a.Do(state.UpdateSystemMetrics, initialMetrics) +// // double-check that we agree about the desired resources +// a.Call(getDesiredResources, state, clock.Now()). +// Equals(resForCU(1)) +// +// for j := 0; j <= i; j++ { +// midRequest := func() {} +// if j == i { +// // at the midpoint, start backtracking by setting the metrics +// midRequest = func() { +// t.Log(" > > updating metrics mid-request") +// a.Do(state.UpdateSystemMetrics, newMetrics) +// a.Call(getDesiredResources, state, clock.Now()). 
+// Equals(resForCU(2)) +// } +// } +// +// steps[j].pre(&pluginWait, midRequest) +// } +// +// for j := i; j >= 0; j-- { +// steps[j].post(&pluginWait) +// } +// } +//} +// +//// Checks that if the VM's min/max bounds change so that the maximum is below the current and +//// desired usage, we try to downscale +//func TestBoundsChangeRequiresDownsale(t *testing.T) { +// a := helpers.NewAssert(t) +// clock := helpers.NewFakeClock(t) +// clockTick := func() { +// clock.Inc(100 * time.Millisecond) +// } +// resForCU := DefaultComputeUnit.Mul +// +// state := helpers.CreateInitialState( +// DefaultInitialStateConfig, +// helpers.WithStoredWarnings(a.StoredWarnings()), +// helpers.WithMinMaxCU(1, 3), +// helpers.WithCurrentCU(2), +// ) +// nextActions := func() core.ActionSet { +// return state.NextActions(clock.Now()) +// } +// +// state.Monitor().Active(true) +// +// // Send initial scheduler request: +// doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(2)) +// +// clockTick() +// +// // Set metrics so the desired resources are still 2 CU +// metrics := core.SystemMetrics{ +// LoadAverage1Min: 0.3, +// MemoryUsageBytes: 0.0, +// } +// a.Do(state.UpdateSystemMetrics, metrics) +// // Check that we agree about desired resources +// a.Call(getDesiredResources, state, clock.Now()). +// Equals(resForCU(2)) +// // Check we've got nothing to do yet +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("4.8s")}, +// }) +// +// clockTick() +// +// // Update the VM to set min=max=1 CU +// a.Do(state.UpdatedVM, helpers.CreateVmInfo( +// DefaultInitialStateConfig.VM, +// helpers.WithCurrentCU(2), +// helpers.WithMinMaxCU(1, 1), +// )) +// +// // We should be making a vm-monitor downscaling request +// // TODO: In the future, we should have a "force-downscale" alternative so the vm-monitor doesn't +// // get to deny the downscaling. 
+// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("4.7s")}, +// MonitorDownscale: &core.ActionMonitorDownscale{ +// Current: resForCU(2), +// Target: resForCU(1), +// +// DesiredLogicalTime: nil, +// }, +// }) +// a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(1)) +// clockTick() +// a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now()) +// // Do NeonVM request for that downscaling +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("4.6s")}, +// NeonVMRequest: &core.ActionNeonVMRequest{ +// Current: resForCU(2), +// Target: resForCU(1), +// +// DesiredLogicalTime: nil, +// }, +// }) +// a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(1)) +// clockTick() +// a.Do(state.NeonVM().RequestSuccessful, clock.Now()) +// // Do plugin request for that downscaling: +// a.Call(nextActions).Equals(core.ActionSet{ +// PluginRequest: &core.ActionPluginRequest{ +// LastPermit: lo.ToPtr(resForCU(2)), +// Target: resForCU(1), +// Metrics: lo.ToPtr(metrics.ToAPI()), +// DesiredLogicalTime: nil, +// }, +// }) +// a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(1)) +// clockTick() +// a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ +// Permit: resForCU(1), +// Migrate: nil, +// }) +// // And then, we shouldn't need to do anything else: +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("4.9s")}, +// }) +//} +// +//// Checks that if the VM's min/max bounds change so that the minimum is above the current and +//// desired usage, we try to upscale +//func TestBoundsChangeRequiresUpscale(t *testing.T) { +// a := helpers.NewAssert(t) +// clock := helpers.NewFakeClock(t) +// clockTick := func() { +// clock.Inc(100 * time.Millisecond) +// } +// resForCU := DefaultComputeUnit.Mul +// +// state := helpers.CreateInitialState( +// DefaultInitialStateConfig, +// helpers.WithStoredWarnings(a.StoredWarnings()), +// helpers.WithMinMaxCU(1, 3), +// helpers.WithCurrentCU(2), +// ) +// nextActions := func() core.ActionSet { +// return state.NextActions(clock.Now()) +// } +// +// state.Monitor().Active(true) +// +// // Send initial scheduler request: +// doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(2)) +// +// clockTick() +// +// // Set metrics so the desired resources are still 2 CU +// metrics := core.SystemMetrics{ +// LoadAverage1Min: 0.3, +// MemoryUsageBytes: 0.0, +// } +// a.Do(state.UpdateSystemMetrics, metrics) +// // Check that we agree about desired resources +// a.Call(getDesiredResources, state, clock.Now()). 
+// Equals(resForCU(2)) +// // Check we've got nothing to do yet +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("4.8s")}, +// }) +// +// clockTick() +// +// // Update the VM to set min=max=3 CU +// a.Do(state.UpdatedVM, helpers.CreateVmInfo( +// DefaultInitialStateConfig.VM, +// helpers.WithCurrentCU(2), +// helpers.WithMinMaxCU(3, 3), +// )) +// +// // We should be making a plugin request to get upscaling: +// a.Call(nextActions).Equals(core.ActionSet{ +// PluginRequest: &core.ActionPluginRequest{ +// LastPermit: lo.ToPtr(resForCU(2)), +// Target: resForCU(3), +// Metrics: lo.ToPtr(metrics.ToAPI()), +// DesiredLogicalTime: nil, +// }, +// }) +// a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(3)) +// clockTick() +// a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ +// Permit: resForCU(3), +// Migrate: nil, +// }) +// // Do NeonVM request for the upscaling +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("4.9s")}, +// NeonVMRequest: &core.ActionNeonVMRequest{ +// Current: resForCU(2), +// Target: resForCU(3), +// DesiredLogicalTime: nil, +// }, +// }) +// a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(3)) +// clockTick() +// a.Do(state.NeonVM().RequestSuccessful, clock.Now()) +// // Do vm-monitor upscale request +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("4.8s")}, +// MonitorUpscale: &core.ActionMonitorUpscale{ +// Current: resForCU(2), +// Target: resForCU(3), +// DesiredLogicalTime: nil, +// }, +// }) +// a.Do(state.Monitor().StartingUpscaleRequest, clock.Now(), resForCU(3)) +// clockTick() +// a.Do(state.Monitor().UpscaleRequestSuccessful, clock.Now()) +// // And then, we shouldn't need to do anything else: +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("4.7s")}, +// }) +//} +// +//// Checks that failed requests to the scheduler plugin and NeonVM API will be retried after a delay +//func TestFailedRequestRetry(t *testing.T) { +// a := helpers.NewAssert(t) +// clock := helpers.NewFakeClock(t) +// clockTick := func() { +// clock.Inc(100 * time.Millisecond) +// } +// resForCU := DefaultComputeUnit.Mul +// +// state := helpers.CreateInitialState( +// DefaultInitialStateConfig, +// helpers.WithStoredWarnings(a.StoredWarnings()), +// helpers.WithMinMaxCU(1, 2), +// helpers.WithCurrentCU(1), +// helpers.WithConfigSetting(func(c *core.Config) { +// // Override values for consistency and ease of use +// c.PluginRetryWait = duration("2s") +// c.NeonVMRetryWait = duration("3s") +// }), +// ) +// nextActions := func() core.ActionSet { +// return state.NextActions(clock.Now()) +// } +// +// state.Monitor().Active(true) +// +// // Send initial scheduler request +// doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(1)) +// +// // Set metrics so that we should be trying to upscale +// clockTick() +// metrics := core.SystemMetrics{ +// LoadAverage1Min: 0.3, +// MemoryUsageBytes: 0.0, +// } +// a.Do(state.UpdateSystemMetrics, metrics) +// +// // We should be asking the scheduler for upscaling +// a.Call(nextActions).Equals(core.ActionSet{ +// PluginRequest: &core.ActionPluginRequest{ +// LastPermit: lo.ToPtr(resForCU(1)), +// Target: resForCU(2), +// Metrics: lo.ToPtr(metrics.ToAPI()), +// DesiredLogicalTime: nil, +// }, +// }) +// a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(2)) +// clockTick() +// // On request 
failure, we retry after Config.PluginRetryWait +// a.Do(state.Plugin().RequestFailed, clock.Now()) +// a. +// WithWarnings("Wanted to make a request to the scheduler plugin, but previous request failed too recently"). +// Call(nextActions). +// Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("2s")}, +// }) +// clock.Inc(duration("2s")) +// // ... and then retry: +// a.Call(nextActions).Equals(core.ActionSet{ +// PluginRequest: &core.ActionPluginRequest{ +// LastPermit: lo.ToPtr(resForCU(1)), +// Target: resForCU(2), +// Metrics: lo.ToPtr(metrics.ToAPI()), +// DesiredLogicalTime: nil, +// }, +// }) +// a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(2)) +// clockTick() +// a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ +// Permit: resForCU(2), +// Migrate: nil, +// }) +// +// // Now, after plugin request is successful, we should be making a request to NeonVM. +// // We'll have that request fail the first time as well: +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("4.9s")}, // plugin request tick +// NeonVMRequest: &core.ActionNeonVMRequest{ +// Current: resForCU(1), +// Target: resForCU(2), +// DesiredLogicalTime: nil, +// }, +// }) +// a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(2)) +// clockTick() +// // On request failure, we retry after Config.NeonVMRetryWait +// a.Do(state.NeonVM().RequestFailed, clock.Now()) +// a. +// WithWarnings("Wanted to make a request to NeonVM API, but recent request failed too recently"). +// Call(nextActions). +// Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("3s")}, // NeonVM retry wait is less than current plugin request tick (4.8s remaining) +// }) +// clock.Inc(duration("3s")) +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("1.8s")}, // plugin request tick +// NeonVMRequest: &core.ActionNeonVMRequest{ +// Current: resForCU(1), +// Target: resForCU(2), +// DesiredLogicalTime: nil, +// }, +// }) +// a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(2)) +// clockTick() +// a.Do(state.NeonVM().RequestSuccessful, clock.Now()) +// +// // And then finally, we should be looking to inform the vm-monitor about this upscaling. +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("1.7s")}, // plugin request tick +// MonitorUpscale: &core.ActionMonitorUpscale{ +// Current: resForCU(1), +// Target: resForCU(2), +// DesiredLogicalTime: nil, +// }, +// }) +//} +// +//// Checks that when metrics are updated during the downscaling process, between the NeonVM request +//// and plugin request, we keep those processes mostly separate, without interference between them. +//// +//// This is distilled from a bug found on staging that resulted in faulty requests to the plugin. +//func TestMetricsConcurrentUpdatedDuringDownscale(t *testing.T) { +// a := helpers.NewAssert(t) +// clock := helpers.NewFakeClock(t) +// clockTick := func() { +// clock.Inc(100 * time.Millisecond) +// } +// resForCU := DefaultComputeUnit.Mul +// +// state := helpers.CreateInitialState( +// DefaultInitialStateConfig, +// helpers.WithStoredWarnings(a.StoredWarnings()), +// // NOTE: current CU is greater than max CU. This is in line with what happens when +// // unassigned pooled VMs created by the control plane are first assigned and endpoint and +// // must immediately scale down. 
+// helpers.WithMinMaxCU(1, 2), +// helpers.WithCurrentCU(3), +// ) +// nextActions := func() core.ActionSet { +// return state.NextActions(clock.Now()) +// } +// +// // Send initial scheduler request - without the monitor active, so we're stuck at 4 CU for now +// a. +// WithWarnings("Wanted to send vm-monitor downscale request, but there's no active connection"). +// Call(state.NextActions, clock.Now()). +// Equals(core.ActionSet{ +// PluginRequest: &core.ActionPluginRequest{ +// LastPermit: nil, +// Target: resForCU(3), +// Metrics: nil, +// DesiredLogicalTime: nil, +// }, +// }) +// a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(3)) +// clockTick() +// a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ +// Permit: resForCU(3), +// Migrate: nil, +// }) +// +// clockTick() +// +// // Monitor's now active, so we should be asking it for downscaling. +// // We don't yet have metrics though, so we only want to downscale as much as is required. +// state.Monitor().Active(true) +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("4.8s")}, +// MonitorDownscale: &core.ActionMonitorDownscale{ +// Current: resForCU(3), +// Target: resForCU(2), +// DesiredLogicalTime: nil, +// }, +// }) +// a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(2)) +// +// // In the middle of the vm-monitor request, update the metrics so that now the desired resource +// // usage is actually 1 CU +// clockTick() +// // the actual metrics we got in the actual logs +// metrics := core.SystemMetrics{ +// LoadAverage1Min: 0.0, +// MemoryUsageBytes: 150589570, // 143.6 MiB +// } +// a.Do(state.UpdateSystemMetrics, metrics) +// +// // nothing to do yet, until the existing vm-monitor request finishes +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("4.7s")}, // plugin request tick wait +// }) +// +// clockTick() +// +// // When the vm-monitor request finishes, we want to both +// // (a) request additional downscaling from vm-monitor, and +// // (b) make a NeonVM request for the initially approved downscaling +// a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now()) +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("4.6s")}, // plugin request tick wait +// NeonVMRequest: &core.ActionNeonVMRequest{ +// Current: resForCU(3), +// Target: resForCU(2), +// DesiredLogicalTime: nil, +// }, +// MonitorDownscale: &core.ActionMonitorDownscale{ +// Current: resForCU(2), +// Target: resForCU(1), +// DesiredLogicalTime: nil, +// }, +// }) +// // Start both requests. The vm-monitor request will finish first, but after that we'll just be +// // waiting on the NeonVM request (and then redoing a follow-up for more downscaling). +// a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(2)) +// a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(1)) +// +// clockTick() +// +// a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now()) +// a. +// WithWarnings( +// "Wanted to make a request to NeonVM API, but there's already NeonVM request (for different resources) ongoing", +// ). +// Call(nextActions). +// Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("4.5s")}, // plugin request tick wait +// }) +// +// clockTick() +// +// a.Do(state.NeonVM().RequestSuccessful, clock.Now()) +// state.Debug(true) +// a. +// Call(nextActions). 
+// Equals(core.ActionSet{ +// // At this point in the original logs from staging, the intended request to the plugin was +// // incorrectly for 1 CU, rather than 2 CU. So, the rest of this test case is mostly just +// // rounding out the rest of the scale-down routine. +// PluginRequest: &core.ActionPluginRequest{ +// LastPermit: lo.ToPtr(resForCU(3)), +// Target: resForCU(2), +// Metrics: lo.ToPtr(metrics.ToAPI()), +// DesiredLogicalTime: nil, +// }, +// NeonVMRequest: &core.ActionNeonVMRequest{ +// Current: resForCU(2), +// Target: resForCU(1), +// DesiredLogicalTime: nil, +// }, +// }) +// a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(2)) +// a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(1)) +// +// clockTick() +// +// a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ +// Permit: resForCU(2), +// Migrate: nil, +// }) +// // Still waiting for NeonVM request to complete +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("4.9s")}, // plugin request tick wait +// }) +// +// clockTick() +// +// // After the NeonVM request finishes, all that we have left to do is inform the plugin of the +// // final downscaling. +// a.Do(state.NeonVM().RequestSuccessful, clock.Now()) +// a.Call(nextActions).Equals(core.ActionSet{ +// PluginRequest: &core.ActionPluginRequest{ +// LastPermit: lo.ToPtr(resForCU(2)), +// Target: resForCU(1), +// Metrics: lo.ToPtr(metrics.ToAPI()), +// DesiredLogicalTime: nil, +// }, +// }) +// a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(1)) +// +// clockTick() +// +// a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ +// Permit: resForCU(1), +// Migrate: nil, +// }) +// // Nothing left to do +// a.Call(nextActions).Equals(core.ActionSet{ +// Wait: &core.ActionWait{Duration: duration("4.9s")}, // plugin request tick wait +// }) +//} diff --git a/pkg/agent/core/testhelpers/construct.go b/pkg/agent/core/testhelpers/construct.go index 301807f13..eb67a65f5 100644 --- a/pkg/agent/core/testhelpers/construct.go +++ b/pkg/agent/core/testhelpers/construct.go @@ -8,7 +8,6 @@ import ( vmapi "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1" "github.com/neondatabase/autoscaling/pkg/agent/core" - "github.com/neondatabase/autoscaling/pkg/agent/core/logiclock" "github.com/neondatabase/autoscaling/pkg/api" ) @@ -37,24 +36,13 @@ type VmInfoOpt interface { modifyVmInfoWithConfig(InitialVmInfoConfig, *api.VmInfo) } -type ClockSourceOpt interface { - InitialStateOpt - - clock() core.LogicClock -} - func CreateInitialState(config InitialStateConfig, opts ...InitialStateOpt) *core.State { vmOpts := []VmInfoOpt{} - var clock core.LogicClock - clock = &logiclock.NilClock{} for _, o := range opts { if vo, ok := o.(VmInfoOpt); ok { vmOpts = append(vmOpts, vo) } - if co, ok := o.(ClockSourceOpt); ok { - clock = co.clock() - } } vm := CreateVmInfo(config.VM, vmOpts...) 
@@ -63,7 +51,7 @@ func CreateInitialState(config InitialStateConfig, opts ...InitialStateOpt) *cor o.modifyStateConfig(&config.Core) } - return core.NewState(vm, config.Core, clock) + return core.NewState(vm, config.Core) } func CreateVmInfo(config InitialVmInfoConfig, opts ...VmInfoOpt) api.VmInfo { @@ -99,7 +87,7 @@ func CreateVmInfo(config InitialVmInfoConfig, opts ...VmInfoOpt) api.VmInfo { ScalingConfig: nil, ScalingEnabled: true, }, - CurrentLogicalTime: nil, + CurrentRevision: nil, } for _, o := range opts { @@ -112,7 +100,6 @@ func CreateVmInfo(config InitialVmInfoConfig, opts ...VmInfoOpt) api.VmInfo { type coreConfigModifier func(*core.Config) type vmInfoConfigModifier func(*InitialVmInfoConfig) type vmInfoModifier func(InitialVmInfoConfig, *api.VmInfo) -type clockInjector func() core.LogicClock var ( _ VmInfoOpt = vmInfoConfigModifier(nil) @@ -133,11 +120,6 @@ func (m vmInfoModifier) modifyVmInfoWithConfig(c InitialVmInfoConfig, vm *api.Vm (func(InitialVmInfoConfig, *api.VmInfo))(m)(c, vm) } -func (m clockInjector) modifyStateConfig(*core.Config) {} -func (m clockInjector) clock() core.LogicClock { - return m() -} - func WithConfigSetting(f func(*core.Config)) InitialStateOpt { return coreConfigModifier(f) } @@ -178,15 +160,3 @@ func WithCurrentCU(cu uint16) VmInfoOpt { vm.SetUsing(c.ComputeUnit.Mul(cu)) }) } - -func WithLogicalTime(t *vmapi.LogicalTime) VmInfoOpt { - return vmInfoModifier(func(c InitialVmInfoConfig, vm *api.VmInfo) { - vm.CurrentLogicalTime = t - }) -} - -func WithClock(c core.LogicClock) ClockSourceOpt { - return clockInjector(func() core.LogicClock { - return c - }) -} diff --git a/pkg/agent/execbridge.go b/pkg/agent/execbridge.go index b9a503bd4..60a0136cd 100644 --- a/pkg/agent/execbridge.go +++ b/pkg/agent/execbridge.go @@ -91,11 +91,11 @@ func (iface *execNeonVMInterface) Request( ctx context.Context, logger *zap.Logger, current, target api.Resources, - desiredLogicalTime *vmv1.LogicalTime, + currentRevision vmv1.RevisionWithTime, ) error { iface.runner.recordResourceChange(current, target, iface.runner.global.metrics.neonvmRequestedChange) - err := iface.runner.doNeonVMRequest(ctx, target, desiredLogicalTime) + err := iface.runner.doNeonVMRequest(ctx, target, currentRevision) if err != nil { iface.runner.status.update(iface.runner.global, func(ps podStatus) podStatus { ps.failedNeonVMRequestCounter.Inc() diff --git a/pkg/agent/executor/core.go b/pkg/agent/executor/core.go index 9e160d34f..ba9f5e9d8 100644 --- a/pkg/agent/executor/core.go +++ b/pkg/agent/executor/core.go @@ -19,7 +19,6 @@ import ( "go.uber.org/zap" "github.com/neondatabase/autoscaling/pkg/agent/core" - "github.com/neondatabase/autoscaling/pkg/agent/core/logiclock" "github.com/neondatabase/autoscaling/pkg/api" "github.com/neondatabase/autoscaling/pkg/util" ) @@ -58,12 +57,11 @@ func NewExecutorCore( stateLogger *zap.Logger, vm api.VmInfo, config Config, - clockSource *logiclock.Clock, ) *ExecutorCore { return &ExecutorCore{ mu: sync.Mutex{}, stateLogger: stateLogger, - core: core.NewState(vm, config.Core, clockSource), + core: core.NewState(vm, config.Core), actions: nil, // (*ExecutorCore).getActions() checks if this is nil lastActionsID: -1, onNextActions: config.OnNextActions, diff --git a/pkg/agent/executor/exec_monitor.go b/pkg/agent/executor/exec_monitor.go index 8595c7795..882d262a6 100644 --- a/pkg/agent/executor/exec_monitor.go +++ b/pkg/agent/executor/exec_monitor.go @@ -99,15 +99,14 @@ func (c *ExecutorCoreWithClients) DoMonitorDownscales(ctx context.Context, logge if !result.Ok 
{ logger.Warn("vm-monitor denied downscale", logFields...) if unchanged { - state.Monitor().DownscaleRequestDenied(endTime) + state.Monitor().DownscaleRequestDenied(endTime, action.TargetRevision) } else { warnSkipBecauseChanged() } } else { logger.Info("vm-monitor approved downscale", logFields...) if unchanged { - state.Monitor().DownscaleRequestAllowed(endTime) - state.Monitor().UpdateLogicalTime(action.DesiredLogicalTime.Rewind(endTime)) + state.Monitor().DownscaleRequestAllowed(endTime, action.TargetRevision) } else { warnSkipBecauseChanged() } @@ -188,7 +187,6 @@ func (c *ExecutorCoreWithClients) DoMonitorUpscales(ctx context.Context, logger logger.Info("vm-monitor upscale request successful", logFields...) if unchanged { state.Monitor().UpscaleRequestSuccessful(endTime) - state.Monitor().UpdateLogicalTime(action.DesiredLogicalTime.Rewind(endTime)) } else { warnSkipBecauseChanged() } diff --git a/pkg/agent/executor/exec_neonvm.go b/pkg/agent/executor/exec_neonvm.go index 442360019..358f6a4c8 100644 --- a/pkg/agent/executor/exec_neonvm.go +++ b/pkg/agent/executor/exec_neonvm.go @@ -17,7 +17,7 @@ type NeonVMInterface interface { _ context.Context, _ *zap.Logger, current, target api.Resources, - desiredLogicalTime *vmv1.LogicalTime, + currentRevision vmv1.RevisionWithTime, ) error } @@ -52,8 +52,10 @@ func (c *ExecutorCoreWithClients) DoNeonVMRequests(ctx context.Context, logger * continue // state has changed, retry. } - err := c.clients.NeonVM.Request(ctx, ifaceLogger, action.Current, action.Target, action.DesiredLogicalTime) endTime := time.Now() + currentRevision := action.TargetRevision.WithTime(endTime) + err := c.clients.NeonVM.Request(ctx, ifaceLogger, action.Current, action.Target, currentRevision) + logFields := []zap.Field{zap.Object("action", action), zap.Duration("duration", endTime.Sub(startTime))} c.update(func(state *core.State) { diff --git a/pkg/agent/executor/exec_plugin.go b/pkg/agent/executor/exec_plugin.go index d05a88e1b..e9abd884b 100644 --- a/pkg/agent/executor/exec_plugin.go +++ b/pkg/agent/executor/exec_plugin.go @@ -61,7 +61,7 @@ func (c *ExecutorCoreWithClients) DoPluginRequests(ctx context.Context, logger * } else { logFields = append(logFields, zap.Any("response", resp)) logger.Info("Plugin request successful", logFields...) - if err := state.Plugin().RequestSuccessful(endTime, action.DesiredLogicalTime, *resp); err != nil { + if err := state.Plugin().RequestSuccessful(endTime, action.TargetRevision, *resp); err != nil { logger.Error("Plugin response validation failed", append(logFields, zap.Error(err))...) 
} } diff --git a/pkg/agent/prommetrics.go b/pkg/agent/prommetrics.go index 41ffcb6aa..ebead8c46 100644 --- a/pkg/agent/prommetrics.go +++ b/pkg/agent/prommetrics.go @@ -4,7 +4,7 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/collectors" - "github.com/neondatabase/autoscaling/pkg/agent/core/logiclock" + "github.com/neondatabase/autoscaling/pkg/agent/core/revsource" "github.com/neondatabase/autoscaling/pkg/util" ) @@ -225,7 +225,7 @@ func makeGlobalMetrics() (GlobalMetrics, *prometheus.Registry) { prometheus.HistogramOpts{ Name: "autoscaling_agent_scaling_latency_seconds", Help: "End-to-end scaling latency", - }, logiclock.AllFlagNames, + }, revsource.AllFlagNames, )), } diff --git a/pkg/agent/runner.go b/pkg/agent/runner.go index 5f889a186..3ec92bfe0 100644 --- a/pkg/agent/runner.go +++ b/pkg/agent/runner.go @@ -33,7 +33,7 @@ import ( vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1" "github.com/neondatabase/autoscaling/pkg/agent/core" - "github.com/neondatabase/autoscaling/pkg/agent/core/logiclock" + "github.com/neondatabase/autoscaling/pkg/agent/core/revsource" "github.com/neondatabase/autoscaling/pkg/agent/executor" "github.com/neondatabase/autoscaling/pkg/agent/schedwatch" "github.com/neondatabase/autoscaling/pkg/api" @@ -196,9 +196,9 @@ func (r *Runner) Run(ctx context.Context, logger *zap.Logger, vmInfoUpdated util pluginRequestJitter := util.NewTimeRange(time.Millisecond, 0, 100).Random() coreExecLogger := execLogger.Named("core") - clock := logiclock.NewClock(func(duration time.Duration, flags logiclock.Flag) { + revisionSource := revsource.NewRevisionSource(func(duration time.Duration, flags vmv1.Flag) { r.global.metrics.scalingLatency. - WithLabelValues(logiclock.FlagsToLabels(flags)...). + WithLabelValues(revsource.FlagsToLabels(flags)...). Observe(duration.Seconds()) }) executorCore := executor.NewExecutorCore(coreExecLogger, getVmInfo(), executor.Config{ @@ -217,8 +217,9 @@ func (r *Runner) Run(ctx context.Context, logger *zap.Logger, vmInfoUpdated util Info: coreExecLogger.Info, Warn: coreExecLogger.Warn, }, + RevisionSource: revisionSource, }, - }, clock) + }) r.executorStateDump = executorCore.StateDump @@ -635,7 +636,7 @@ func doMetricsRequest( func (r *Runner) doNeonVMRequest( ctx context.Context, target api.Resources, - desiredLogicalTime *vmv1.LogicalTime, + currentRevision vmv1.RevisionWithTime, ) error { patches := []patch.Operation{{ Op: patch.OpReplace, @@ -648,7 +649,7 @@ func (r *Runner) doNeonVMRequest( }, { Op: patch.OpReplace, Path: "/spec/desiredLogicalTime", - Value: desiredLogicalTime, + Value: currentRevision, }} patchPayload, err := json.Marshal(patches) diff --git a/pkg/api/vminfo.go b/pkg/api/vminfo.go index 3ce73a4e8..6b19233e6 100644 --- a/pkg/api/vminfo.go +++ b/pkg/api/vminfo.go @@ -53,12 +53,12 @@ func HasAlwaysMigrateLabel(obj metav1.ObjectMetaAccessor) bool { // care about. It takes various labels and annotations into account, so certain fields might be // different from what's strictly in the VirtualMachine object. 
type VmInfo struct { - Name string `json:"name"` - Namespace string `json:"namespace"` - Cpu VmCpuInfo `json:"cpu"` - Mem VmMemInfo `json:"mem"` - Config VmConfig `json:"config"` - CurrentLogicalTime *vmapi.LogicalTime `json:"currentLogicalTime,omitempty"` + Name string `json:"name"` + Namespace string `json:"namespace"` + Cpu VmCpuInfo `json:"cpu"` + Mem VmMemInfo `json:"mem"` + Config VmConfig `json:"config"` + CurrentRevision *vmapi.RevisionWithTime `json:"currentRevision,omitempty"` } type VmCpuInfo struct { @@ -156,7 +156,7 @@ func ExtractVmInfo(logger *zap.Logger, vm *vmapi.VirtualMachine) (*VmInfo, error return nil, fmt.Errorf("error extracting VM info: %w", err) } - info.CurrentLogicalTime = vm.Status.CurrentLogicalTime + info.CurrentRevision = vm.Status.CurrentRevision return info, nil } @@ -198,7 +198,7 @@ func extractVmInfoGeneric( ScalingEnabled: scalingEnabled, ScalingConfig: nil, // set below, maybe }, - CurrentLogicalTime: nil, // set later, maybe + CurrentRevision: nil, // set later, maybe } if boundsJSON, ok := obj.GetObjectMeta().GetAnnotations()[AnnotationAutoscalingBounds]; ok { From 73de584872d959da4ca2966eea5ef12f775ff7d0 Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Fri, 12 Jul 2024 22:39:59 +0400 Subject: [PATCH 26/57] revert some of the changes and fix tests Signed-off-by: Oleg Vasilev --- .../bases/vm.neon.tech_virtualmachines.yaml | 6 +- pkg/agent/core/revsource/revsource.go | 6 +- pkg/agent/core/state.go | 13 +- pkg/agent/core/state_test.go | 2308 +++++++++-------- 4 files changed, 1171 insertions(+), 1162 deletions(-) diff --git a/neonvm/config/crd/bases/vm.neon.tech_virtualmachines.yaml b/neonvm/config/crd/bases/vm.neon.tech_virtualmachines.yaml index ddade3d2b..cb3d04d3b 100644 --- a/neonvm/config/crd/bases/vm.neon.tech_virtualmachines.yaml +++ b/neonvm/config/crd/bases/vm.neon.tech_virtualmachines.yaml @@ -2445,13 +2445,13 @@ spec: process. items: properties: - Value: - default: "" - type: string name: description: Name of the environment variable. Must be a C_IDENTIFIER. 
type: string + value: + default: "" + type: string required: - name type: object diff --git a/pkg/agent/core/revsource/revsource.go b/pkg/agent/core/revsource/revsource.go index 537826581..38306df77 100644 --- a/pkg/agent/core/revsource/revsource.go +++ b/pkg/agent/core/revsource/revsource.go @@ -55,10 +55,10 @@ func (c *RevisionSource) nextValue() int64 { return c.offset + int64(len(c.measurements)) } -func (c *RevisionSource) Next(now time.Time) vmv1.Revision { +func (c *RevisionSource) Next(now time.Time, flags vmv1.Flag) vmv1.Revision { ret := vmv1.Revision{ Value: c.nextValue(), - Flags: 0, + Flags: flags, } c.measurements = append(c.measurements, now) return ret @@ -90,7 +90,7 @@ func (c *RevisionSource) Observe(moment time.Time, rev vmv1.Revision) error { type NilRevisionSource struct{} -func (c *NilRevisionSource) Next(_ time.Time) vmv1.Revision { +func (c *NilRevisionSource) Next(_ time.Time, _ vmv1.Flag) vmv1.Revision { return vmv1.Revision{ Value: 0, Flags: 0, diff --git a/pkg/agent/core/state.go b/pkg/agent/core/state.go index 82c654faf..e24a9d2f8 100644 --- a/pkg/agent/core/state.go +++ b/pkg/agent/core/state.go @@ -214,7 +214,7 @@ func (ns *neonvmState) ongoingRequest() bool { } type RevisionSource interface { - Next(ts time.Time) vmv1.Revision + Next(ts time.Time, flags vmv1.Flag) vmv1.Revision Observe(moment time.Time, rev vmv1.Revision) error } @@ -869,17 +869,20 @@ func (s *state) updateTargetRevision( } } - s.TargetRevision = s.Config.RevisionSource.Next(now) + var flags vmv1.Flag if desired.HasFieldGreaterThan(current) { - s.TargetRevision.Flags.Set(revsource.Upscale) + flags.Set(revsource.Upscale) } if desired.HasFieldLessThan(current) { - s.TargetRevision.Flags.Set(revsource.Downscale) + flags.Set(revsource.Downscale) } if immediate { - s.TargetRevision.Flags.Set(revsource.Immediate) + flags.Set(revsource.Immediate) } + + s.TargetRevision = s.Config.RevisionSource.Next(now, flags) + } func (s *state) updateCurrentRevision(rev vmv1.RevisionWithTime) { diff --git a/pkg/agent/core/state_test.go b/pkg/agent/core/state_test.go index 56bdf908a..b7d8b14cc 100644 --- a/pkg/agent/core/state_test.go +++ b/pkg/agent/core/state_test.go @@ -2,12 +2,12 @@ package core_test import ( "fmt" - "github.com/stretchr/testify/require" "testing" "time" "github.com/samber/lo" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" "go.uber.org/zap" "golang.org/x/exp/slices" @@ -235,6 +235,13 @@ func duration(s string) time.Duration { return d } +func zeroRev(flag vmv1.Flag, t time.Time) vmv1.RevisionWithTime { + return vmv1.Revision{ + Value: 0, + Flags: 0, + }.WithTime(t) +} + // Thorough checks of a relatively simple flow - scaling from 1 CU to 2 CU and back down. 
func TestBasicScaleUpAndDownFlow(t *testing.T) { a := helpers.NewAssert(t) @@ -463,1155 +470,1154 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { }) } +// Test that in a stable state, requests to the plugin happen exactly every Config.PluginRequestTick +func TestPeriodicPluginRequest(t *testing.T) { + a := helpers.NewAssert(t) + clock := helpers.NewFakeClock(t) + + state := helpers.CreateInitialState( + DefaultInitialStateConfig, + helpers.WithStoredWarnings(a.StoredWarnings()), + ) + + state.Monitor().Active(true) + + metrics := core.SystemMetrics{ + LoadAverage1Min: 0.0, + MemoryUsageBytes: 0.0, + } + resources := DefaultComputeUnit + + a.Do(state.UpdateSystemMetrics, metrics) + + base := duration("0s") + clock.Elapsed().AssertEquals(base) + + clockTick := duration("100ms") + reqDuration := duration("50ms") + reqEvery := DefaultInitialStateConfig.Core.PluginRequestTick + endTime := duration("20s") + + doInitialPluginRequest(a, state, clock, clockTick, lo.ToPtr(metrics.ToAPI()), resources) + + for clock.Elapsed().Duration < endTime { + timeSinceScheduledRequest := (clock.Elapsed().Duration - base) % reqEvery + + if timeSinceScheduledRequest != 0 { + timeUntilNextRequest := reqEvery - timeSinceScheduledRequest + a.Call(state.NextActions, clock.Now()).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: timeUntilNextRequest}, + }) + clock.Inc(clockTick) + } else { + a.Call(state.NextActions, clock.Now()).Equals(core.ActionSet{ + PluginRequest: &core.ActionPluginRequest{ + LastPermit: &resources, + Target: resources, + Metrics: lo.ToPtr(metrics.ToAPI()), + TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + }, + }) + a.Do(state.Plugin().StartingRequest, clock.Now(), resources) + a.Call(state.NextActions, clock.Now()).Equals(core.ActionSet{}) + clock.Inc(reqDuration) + a.Call(state.NextActions, clock.Now()).Equals(core.ActionSet{}) + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), vmv1.ZeroRevision.WithTime(clock.Now()), api.PluginResponse{ + Permit: resources, + Migrate: nil, + }) + clock.Inc(clockTick - reqDuration) + } + } +} + +// Checks that when downscaling is denied, we both (a) try again with higher resources, or (b) wait +// to retry if there aren't higher resources to try with. +func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { + a := helpers.NewAssert(t) + clock := helpers.NewFakeClock(t) + clockTickDuration := duration("0.1s") + clockTick := func() { + clock.Inc(clockTickDuration) + } + resForCU := DefaultComputeUnit.Mul + + state := helpers.CreateInitialState( + DefaultInitialStateConfig, + helpers.WithStoredWarnings(a.StoredWarnings()), + helpers.WithMinMaxCU(1, 8), + helpers.WithCurrentCU(6), // NOTE: Start at 6 CU, so we're trying to scale down immediately. + helpers.WithConfigSetting(func(c *core.Config) { + // values close to the default, so request timing works out a little better. + c.PluginRequestTick = duration("7s") + c.MonitorDeniedDownscaleCooldown = duration("4s") + }), + ) + + nextActions := func() core.ActionSet { + return state.NextActions(clock.Now()) + } + + state.Monitor().Active(true) + + doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(6)) + + // Set metrics + clockTick() + metrics := core.SystemMetrics{ + LoadAverage1Min: 0.0, + MemoryUsageBytes: 0.0, + } + a.Do(state.UpdateSystemMetrics, metrics) + // double-check that we agree about the desired resources + a.Call(getDesiredResources, state, clock.Now()). 
+ Equals(resForCU(1)) + + // Broadly the idea here is that we should be trying to request downscaling from the vm-monitor, + // and retrying with progressively higher values until either we get approved, or we run out of + // options, at which point we should wait until later to re-request downscaling. + // + // This behavior results in linear retry passes. + // + // For this test, we: + // 1. Deny any request in the first pass + // 2. Approve only down to 3 CU on the second pass + // a. triggers NeonVM request + // b. triggers plugin request + // 3. Deny all requests in the third pass (i.e. stay at 3 CU) + // 4. Approve down to 1 CU on the fourth pass + // a. triggers NeonVM request + // b. triggers plugin request + // + // ---- + // + // First pass: deny downscaling. + clock.Elapsed() + + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("6.8s")}, + MonitorDownscale: &core.ActionMonitorDownscale{ + Current: resForCU(6), + Target: resForCU(5), + TargetRevision: zeroRev(revsource.Downscale, clock.Now()), + }, + }) + a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(5)) + clockTick() + a.Do(state.Monitor().DownscaleRequestDenied, clock.Now(), zeroRev(revsource.Downscale, clock.Now())) + + // At the end, we should be waiting to retry downscaling: + a.Call(nextActions).Equals(core.ActionSet{ + // Taken from DefaultInitialStateConfig.Core.MonitorDeniedDownscaleCooldown + Wait: &core.ActionWait{Duration: duration("4.0s")}, + }) + + clock.Inc(duration("4s")) + currentPluginWait := duration("2.7s") + + // Second pass: Approve only down to 3 CU, then NeonVM & plugin requests. + for cu := uint16(5); cu >= 2; cu -= 1 { + var expectedNeonVMRequest *core.ActionNeonVMRequest + if cu < 5 { + expectedNeonVMRequest = &core.ActionNeonVMRequest{ + Current: resForCU(6), + Target: resForCU(cu + 1), + TargetRevision: zeroRev(revsource.Downscale, clock.Now()), + } + } + + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: currentPluginWait}, + MonitorDownscale: &core.ActionMonitorDownscale{ + Current: resForCU(cu + 1), + Target: resForCU(cu), + TargetRevision: zeroRev(revsource.Downscale, clock.Now()), + }, + NeonVMRequest: expectedNeonVMRequest, + }) + a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(cu)) + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: currentPluginWait}, + NeonVMRequest: expectedNeonVMRequest, + }) + clockTick() + currentPluginWait -= clockTickDuration + if cu >= 3 /* allow down to 3 */ { + a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now(), zeroRev(revsource.Downscale, clock.Now())) + } else { + a.Do(state.Monitor().DownscaleRequestDenied, clock.Now(), zeroRev(revsource.Downscale, clock.Now())) + } + } + // At this point, waiting 3.7s for next attempt to downscale below 3 CU (last request was + // successful, but the one before it wasn't), and 0.8s for plugin tick. + // Also, because downscaling was approved, we should want to make a NeonVM request to do that. 
+ a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("2.3s")}, + NeonVMRequest: &core.ActionNeonVMRequest{ + Current: resForCU(6), + Target: resForCU(3), + TargetRevision: zeroRev(revsource.Downscale, clock.Now()), + }, + }) + // Make the request: + a.Do(state.NeonVM().StartingRequest, time.Now(), resForCU(3)) + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("2.3s")}, + }) + clockTick() + a.Do(state.NeonVM().RequestSuccessful, time.Now()) + // Successfully scaled down, so we should now inform the plugin. But also, we'll want to retry + // the downscale request to vm-monitor once the retry is up: + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("3.9s")}, + PluginRequest: &core.ActionPluginRequest{ + LastPermit: lo.ToPtr(resForCU(6)), + Target: resForCU(3), + Metrics: lo.ToPtr(metrics.ToAPI()), + TargetRevision: zeroRev(revsource.Downscale, clock.Now()), + }, + }) + a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(3)) + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("3.9s")}, + }) + clockTick() + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), vmv1.ZeroRevision.WithTime(clock.Now()), api.PluginResponse{ + Permit: resForCU(3), + Migrate: nil, + }) + // ... And *now* there's nothing left to do but wait until downscale wait expires: + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("3.8s")}, + }) + + // so, wait for that: + clock.Inc(duration("3.8s")) + + // Third pass: deny requested downscaling. + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("3.1s")}, + MonitorDownscale: &core.ActionMonitorDownscale{ + Current: resForCU(3), + Target: resForCU(2), + TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + }, + }) + a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(2)) + clockTick() + a.Do(state.Monitor().DownscaleRequestDenied, clock.Now(), vmv1.ZeroRevision.WithTime(clock.Now())) + // At the end, we should be waiting to retry downscaling (but actually, the regular plugin + // request is coming up sooner). + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("3.0s")}, + }) + // ... 
so, wait for that plugin request/response, and then wait to retry downscaling: + clock.Inc(duration("3s")) + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("1s")}, // still want to retry vm-monitor downscaling + PluginRequest: &core.ActionPluginRequest{ + LastPermit: lo.ToPtr(resForCU(3)), + Target: resForCU(3), + Metrics: lo.ToPtr(metrics.ToAPI()), + TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + }, + }) + a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(3)) + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("1s")}, // still waiting on retrying vm-monitor downscaling + }) + clockTick() + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), vmv1.ZeroRevision.WithTime(clock.Now()), api.PluginResponse{ + Permit: resForCU(3), + Migrate: nil, + }) + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("0.9s")}, // yep, still waiting on retrying vm-monitor downscaling + }) + + clock.Inc(duration("0.9s")) + + // Fourth pass: approve down to 1 CU - wait to do the NeonVM requests until the end + currentPluginWait = duration("6.0s") + for cu := uint16(2); cu >= 1; cu -= 1 { + var expectedNeonVMRequest *core.ActionNeonVMRequest + if cu < 2 { + expectedNeonVMRequest = &core.ActionNeonVMRequest{ + Current: resForCU(3), + Target: resForCU(cu + 1), + TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + } + } + + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: currentPluginWait}, + MonitorDownscale: &core.ActionMonitorDownscale{ + Current: resForCU(cu + 1), + Target: resForCU(cu), + TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + }, + NeonVMRequest: expectedNeonVMRequest, + }) + a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(cu)) + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: currentPluginWait}, + NeonVMRequest: expectedNeonVMRequest, + }) + clockTick() + currentPluginWait -= clockTickDuration + a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now(), vmv1.ZeroRevision.WithTime(clock.Now())) + } + // Still waiting on plugin request tick, but we can make a NeonVM request to enact the + // downscaling right away ! + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("5.8s")}, + NeonVMRequest: &core.ActionNeonVMRequest{ + Current: resForCU(3), + Target: resForCU(1), + TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + }, + }) + a.Do(state.NeonVM().StartingRequest, time.Now(), resForCU(1)) + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("5.8s")}, // yep, still waiting on the plugin + }) + clockTick() + a.Do(state.NeonVM().RequestSuccessful, time.Now()) + // Successfully downscaled, so now we should inform the plugin. Not waiting on any retries. + a.Call(nextActions).Equals(core.ActionSet{ + PluginRequest: &core.ActionPluginRequest{ + LastPermit: lo.ToPtr(resForCU(3)), + Target: resForCU(1), + Metrics: lo.ToPtr(metrics.ToAPI()), + TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + }, + }) + a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(1)) + a.Call(nextActions).Equals(core.ActionSet{ + // not waiting on anything! + }) + clockTick() + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), vmv1.ZeroRevision.WithTime(clock.Now()), api.PluginResponse{ + Permit: resForCU(1), + Migrate: nil, + }) + // And now there's truly nothing left to do. 
Back to waiting on plugin request tick :) + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("6.9s")}, + }) +} + +// Checks that we scale up in a timely manner when the vm-monitor requests it, and don't request +// downscaling until the time expires. +func TestRequestedUpscale(t *testing.T) { + a := helpers.NewAssert(t) + clock := helpers.NewFakeClock(t) + clockTick := func() { + clock.Inc(100 * time.Millisecond) + } + resForCU := DefaultComputeUnit.Mul + + state := helpers.CreateInitialState( + DefaultInitialStateConfig, + helpers.WithStoredWarnings(a.StoredWarnings()), + helpers.WithConfigSetting(func(c *core.Config) { + c.MonitorRequestedUpscaleValidPeriod = duration("6s") // Override this for consistency + }), + ) + nextActions := func() core.ActionSet { + return state.NextActions(clock.Now()) + } + + state.Monitor().Active(true) + + // Send initial scheduler request: + doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(1)) + + // Set metrics + clockTick() + lastMetrics := core.SystemMetrics{ + LoadAverage1Min: 0.0, + MemoryUsageBytes: 0.0, + } + a.Do(state.UpdateSystemMetrics, lastMetrics) + + // Check we're not supposed to do anything + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("4.8s")}, + }) + + // Have the vm-monitor request upscaling: + a.Do(state.Monitor().UpscaleRequested, clock.Now(), api.MoreResources{Cpu: false, Memory: true}) + // First need to check with the scheduler plugin to get approval for upscaling: + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("6s")}, // if nothing else happens, requested upscale expires. + PluginRequest: &core.ActionPluginRequest{ + LastPermit: lo.ToPtr(resForCU(1)), + Target: resForCU(2), + Metrics: lo.ToPtr(lastMetrics.ToAPI()), + TargetRevision: zeroRev(revsource.Upscale|revsource.Immediate, clock.Now()), + }, + }) + a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(2)) + clockTick() + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("5.9s")}, // same waiting for requested upscale expiring + }) + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), vmv1.ZeroRevision.WithTime(clock.Now()), api.PluginResponse{ + Permit: resForCU(2), + Migrate: nil, + }) + + // After approval from the scheduler plugin, now need to make NeonVM request: + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("4.9s")}, // plugin tick wait is earlier than requested upscale expiration + NeonVMRequest: &core.ActionNeonVMRequest{ + Current: resForCU(1), + Target: resForCU(2), + TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + }, + }) + a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(2)) + clockTick() + a.Do(state.NeonVM().RequestSuccessful, clock.Now()) + + // Finally, tell the vm-monitor that it got upscaled: + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("4.8s")}, // still waiting on plugin tick + MonitorUpscale: &core.ActionMonitorUpscale{ + Current: resForCU(1), + Target: resForCU(2), + TargetRevision: zeroRev(revsource.Upscale|revsource.Immediate, clock.Now()), + }, + }) + a.Do(state.Monitor().StartingUpscaleRequest, clock.Now(), resForCU(2)) + clockTick() + a.Do(state.Monitor().UpscaleRequestSuccessful, clock.Now()) + + // After everything, we should be waiting on both: + // (a) scheduler plugin tick (4.7s remaining), and + // (b) vm-monitor requested upscaling expiring (5.7s 
remaining) + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("4.7s")}, + }) + + // Do the routine scheduler plugin request. Still waiting 1s for vm-monitor request expiration + clock.Inc(duration("4.7s")) + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("1s")}, + PluginRequest: &core.ActionPluginRequest{ + LastPermit: lo.ToPtr(resForCU(2)), + Target: resForCU(2), + Metrics: lo.ToPtr(lastMetrics.ToAPI()), + TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + }, + }) + a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(2)) + clockTick() + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("0.9s")}, // waiting for requested upscale expiring + }) + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), vmv1.ZeroRevision.WithTime(clock.Now()), api.PluginResponse{ + Permit: resForCU(2), + Migrate: nil, + }) + + // Still should just be waiting on vm-monitor upscale expiring + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("0.9s")}, + }) + clock.Inc(duration("0.9s")) + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("4s")}, // now, waiting on plugin request tick + MonitorDownscale: &core.ActionMonitorDownscale{ + Current: resForCU(2), + Target: resForCU(1), + TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + }, + }) +} + +// Checks that if we get new metrics partway through downscaling, then we pivot back to upscaling +// without further requests in furtherance of downscaling. // -//// Test that in a stable state, requests to the plugin happen exactly every Config.PluginRequestTick -//func TestPeriodicPluginRequest(t *testing.T) { -// a := helpers.NewAssert(t) -// clock := helpers.NewFakeClock(t) -// -// state := helpers.CreateInitialState( -// DefaultInitialStateConfig, -// helpers.WithStoredWarnings(a.StoredWarnings()), -// ) -// -// state.Monitor().Active(true) -// -// metrics := core.SystemMetrics{ -// LoadAverage1Min: 0.0, -// MemoryUsageBytes: 0.0, -// } -// resources := DefaultComputeUnit -// -// a.Do(state.UpdateSystemMetrics, metrics) -// -// base := duration("0s") -// clock.Elapsed().AssertEquals(base) -// -// clockTick := duration("100ms") -// reqDuration := duration("50ms") -// reqEvery := DefaultInitialStateConfig.Core.PluginRequestTick -// endTime := duration("20s") -// -// doInitialPluginRequest(a, state, clock, clockTick, lo.ToPtr(metrics.ToAPI()), resources) -// -// for clock.Elapsed().Duration < endTime { -// timeSinceScheduledRequest := (clock.Elapsed().Duration - base) % reqEvery -// -// if timeSinceScheduledRequest != 0 { -// timeUntilNextRequest := reqEvery - timeSinceScheduledRequest -// a.Call(state.NextActions, clock.Now()).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: timeUntilNextRequest}, -// }) -// clock.Inc(clockTick) -// } else { -// a.Call(state.NextActions, clock.Now()).Equals(core.ActionSet{ -// PluginRequest: &core.ActionPluginRequest{ -// LastPermit: &resources, -// Target: resources, -// Metrics: lo.ToPtr(metrics.ToAPI()), -// TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), -// }, -// }) -// a.Do(state.Plugin().StartingRequest, clock.Now(), resources) -// a.Call(state.NextActions, clock.Now()).Equals(core.ActionSet{}) -// clock.Inc(reqDuration) -// a.Call(state.NextActions, clock.Now()).Equals(core.ActionSet{}) -// a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ -// Permit: 
resources, -// Migrate: nil, -// }) -// clock.Inc(clockTick - reqDuration) -// } -// } -//} -// -//// Checks that when downscaling is denied, we both (a) try again with higher resources, or (b) wait -//// to retry if there aren't higher resources to try with. -//func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { -// a := helpers.NewAssert(t) -// clock := helpers.NewFakeClock(t) -// clockTickDuration := duration("0.1s") -// clockTick := func() { -// clock.Inc(clockTickDuration) -// } -// resForCU := DefaultComputeUnit.Mul -// -// state := helpers.CreateInitialState( -// DefaultInitialStateConfig, -// helpers.WithStoredWarnings(a.StoredWarnings()), -// helpers.WithMinMaxCU(1, 8), -// helpers.WithCurrentCU(6), // NOTE: Start at 6 CU, so we're trying to scale down immediately. -// helpers.WithConfigSetting(func(c *core.Config) { -// // values close to the default, so request timing works out a little better. -// c.PluginRequestTick = duration("7s") -// c.MonitorDeniedDownscaleCooldown = duration("4s") -// }), -// ) -// -// nextActions := func() core.ActionSet { -// return state.NextActions(clock.Now()) -// } -// -// state.Monitor().Active(true) -// -// doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(6)) -// -// // Set metrics -// clockTick() -// metrics := core.SystemMetrics{ -// LoadAverage1Min: 0.0, -// MemoryUsageBytes: 0.0, -// } -// a.Do(state.UpdateSystemMetrics, metrics) -// // double-check that we agree about the desired resources -// a.Call(getDesiredResources, state, clock.Now()). -// Equals(resForCU(1)) -// -// // Broadly the idea here is that we should be trying to request downscaling from the vm-monitor, -// // and retrying with progressively higher values until either we get approved, or we run out of -// // options, at which point we should wait until later to re-request downscaling. -// // -// // This behavior results in linear retry passes. -// // -// // For this test, we: -// // 1. Deny any request in the first pass -// // 2. Approve only down to 3 CU on the second pass -// // a. triggers NeonVM request -// // b. triggers plugin request -// // 3. Deny all requests in the third pass (i.e. stay at 3 CU) -// // 4. Approve down to 1 CU on the fourth pass -// // a. triggers NeonVM request -// // b. triggers plugin request -// // -// // ---- -// // -// // First pass: deny downscaling. -// clock.Elapsed() -// -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("6.8s")}, -// MonitorDownscale: &core.ActionMonitorDownscale{ -// Current: resForCU(6), -// Target: resForCU(5), -// DesiredLogicalTime: nil, -// }, -// }) -// a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(5)) -// clockTick() -// a.Do(state.Monitor().DownscaleRequestDenied, clock.Now()) -// -// // At the end, we should be waiting to retry downscaling: -// a.Call(nextActions).Equals(core.ActionSet{ -// // Taken from DefaultInitialStateConfig.Core.MonitorDeniedDownscaleCooldown -// Wait: &core.ActionWait{Duration: duration("4.0s")}, -// }) -// -// clock.Inc(duration("4s")) -// currentPluginWait := duration("2.7s") -// -// // Second pass: Approve only down to 3 CU, then NeonVM & plugin requests. 
-// for cu := uint16(5); cu >= 2; cu -= 1 { -// var expectedNeonVMRequest *core.ActionNeonVMRequest -// if cu < 5 { -// expectedNeonVMRequest = &core.ActionNeonVMRequest{ -// Current: resForCU(6), -// Target: resForCU(cu + 1), -// DesiredLogicalTime: nil, -// } -// } -// -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: currentPluginWait}, -// MonitorDownscale: &core.ActionMonitorDownscale{ -// Current: resForCU(cu + 1), -// Target: resForCU(cu), -// DesiredLogicalTime: nil, -// }, -// NeonVMRequest: expectedNeonVMRequest, -// }) -// a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(cu)) -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: currentPluginWait}, -// NeonVMRequest: expectedNeonVMRequest, -// }) -// clockTick() -// currentPluginWait -= clockTickDuration -// if cu >= 3 /* allow down to 3 */ { -// a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now()) -// } else { -// a.Do(state.Monitor().DownscaleRequestDenied, clock.Now()) -// } -// } -// // At this point, waiting 3.7s for next attempt to downscale below 3 CU (last request was -// // successful, but the one before it wasn't), and 0.8s for plugin tick. -// // Also, because downscaling was approved, we should want to make a NeonVM request to do that. -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("2.3s")}, -// NeonVMRequest: &core.ActionNeonVMRequest{ -// Current: resForCU(6), -// Target: resForCU(3), -// DesiredLogicalTime: nil, -// }, -// }) -// // Make the request: -// a.Do(state.NeonVM().StartingRequest, time.Now(), resForCU(3)) -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("2.3s")}, -// }) -// clockTick() -// a.Do(state.NeonVM().RequestSuccessful, time.Now()) -// // Successfully scaled down, so we should now inform the plugin. But also, we'll want to retry -// // the downscale request to vm-monitor once the retry is up: -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("3.9s")}, -// PluginRequest: &core.ActionPluginRequest{ -// LastPermit: lo.ToPtr(resForCU(6)), -// Target: resForCU(3), -// Metrics: lo.ToPtr(metrics.ToAPI()), -// DesiredLogicalTime: nil, -// }, -// }) -// a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(3)) -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("3.9s")}, -// }) -// clockTick() -// a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ -// Permit: resForCU(3), -// Migrate: nil, -// }) -// // ... And *now* there's nothing left to do but wait until downscale wait expires: -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("3.8s")}, -// }) -// -// // so, wait for that: -// clock.Inc(duration("3.8s")) -// -// // Third pass: deny requested downscaling. -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("3.1s")}, -// MonitorDownscale: &core.ActionMonitorDownscale{ -// Current: resForCU(3), -// Target: resForCU(2), -// DesiredLogicalTime: nil, -// }, -// }) -// a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(2)) -// clockTick() -// a.Do(state.Monitor().DownscaleRequestDenied, clock.Now()) -// // At the end, we should be waiting to retry downscaling (but actually, the regular plugin -// // request is coming up sooner). 
-// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("3.0s")}, -// }) -// // ... so, wait for that plugin request/response, and then wait to retry downscaling: -// clock.Inc(duration("3s")) -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("1s")}, // still want to retry vm-monitor downscaling -// PluginRequest: &core.ActionPluginRequest{ -// LastPermit: lo.ToPtr(resForCU(3)), -// Target: resForCU(3), -// Metrics: lo.ToPtr(metrics.ToAPI()), -// DesiredLogicalTime: nil, -// }, -// }) -// a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(3)) -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("1s")}, // still waiting on retrying vm-monitor downscaling -// }) -// clockTick() -// a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ -// Permit: resForCU(3), -// Migrate: nil, -// }) -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("0.9s")}, // yep, still waiting on retrying vm-monitor downscaling -// }) -// -// clock.Inc(duration("0.9s")) -// -// // Fourth pass: approve down to 1 CU - wait to do the NeonVM requests until the end -// currentPluginWait = duration("6.0s") -// for cu := uint16(2); cu >= 1; cu -= 1 { -// var expectedNeonVMRequest *core.ActionNeonVMRequest -// if cu < 2 { -// expectedNeonVMRequest = &core.ActionNeonVMRequest{ -// Current: resForCU(3), -// Target: resForCU(cu + 1), -// DesiredLogicalTime: nil, -// } -// } -// -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: currentPluginWait}, -// MonitorDownscale: &core.ActionMonitorDownscale{ -// Current: resForCU(cu + 1), -// Target: resForCU(cu), -// DesiredLogicalTime: nil, -// }, -// NeonVMRequest: expectedNeonVMRequest, -// }) -// a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(cu)) -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: currentPluginWait}, -// NeonVMRequest: expectedNeonVMRequest, -// }) -// clockTick() -// currentPluginWait -= clockTickDuration -// a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now()) -// } -// // Still waiting on plugin request tick, but we can make a NeonVM request to enact the -// // downscaling right away ! -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("5.8s")}, -// NeonVMRequest: &core.ActionNeonVMRequest{ -// Current: resForCU(3), -// Target: resForCU(1), -// DesiredLogicalTime: nil, -// }, -// }) -// a.Do(state.NeonVM().StartingRequest, time.Now(), resForCU(1)) -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("5.8s")}, // yep, still waiting on the plugin -// }) -// clockTick() -// a.Do(state.NeonVM().RequestSuccessful, time.Now()) -// // Successfully downscaled, so now we should inform the plugin. Not waiting on any retries. -// a.Call(nextActions).Equals(core.ActionSet{ -// PluginRequest: &core.ActionPluginRequest{ -// LastPermit: lo.ToPtr(resForCU(3)), -// Target: resForCU(1), -// Metrics: lo.ToPtr(metrics.ToAPI()), -// DesiredLogicalTime: nil, -// }, -// }) -// a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(1)) -// a.Call(nextActions).Equals(core.ActionSet{ -// // not waiting on anything! 
-// }) -// clockTick() -// a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ -// Permit: resForCU(1), -// Migrate: nil, -// }) -// // And now there's truly nothing left to do. Back to waiting on plugin request tick :) -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("6.9s")}, -// }) -//} -// -//// Checks that we scale up in a timely manner when the vm-monitor requests it, and don't request -//// downscaling until the time expires. -//func TestRequestedUpscale(t *testing.T) { -// a := helpers.NewAssert(t) -// clock := helpers.NewFakeClock(t) -// clockTick := func() { -// clock.Inc(100 * time.Millisecond) -// } -// resForCU := DefaultComputeUnit.Mul -// -// state := helpers.CreateInitialState( -// DefaultInitialStateConfig, -// helpers.WithStoredWarnings(a.StoredWarnings()), -// helpers.WithConfigSetting(func(c *core.Config) { -// c.MonitorRequestedUpscaleValidPeriod = duration("6s") // Override this for consistency -// }), -// ) -// nextActions := func() core.ActionSet { -// return state.NextActions(clock.Now()) -// } -// -// state.Monitor().Active(true) -// -// // Send initial scheduler request: -// doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(1)) -// -// // Set metrics -// clockTick() -// lastMetrics := core.SystemMetrics{ -// LoadAverage1Min: 0.0, -// MemoryUsageBytes: 0.0, -// } -// a.Do(state.UpdateSystemMetrics, lastMetrics) -// -// // Check we're not supposed to do anything -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("4.8s")}, -// }) -// -// // Have the vm-monitor request upscaling: -// a.Do(state.Monitor().UpscaleRequested, clock.Now(), api.MoreResources{Cpu: false, Memory: true}) -// // First need to check with the scheduler plugin to get approval for upscaling: -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("6s")}, // if nothing else happens, requested upscale expires. 
-// PluginRequest: &core.ActionPluginRequest{ -// LastPermit: lo.ToPtr(resForCU(1)), -// Target: resForCU(2), -// Metrics: lo.ToPtr(lastMetrics.ToAPI()), -// DesiredLogicalTime: nil, -// }, -// }) -// a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(2)) -// clockTick() -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("5.9s")}, // same waiting for requested upscale expiring -// }) -// a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ -// Permit: resForCU(2), -// Migrate: nil, -// }) -// -// // After approval from the scheduler plugin, now need to make NeonVM request: -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("4.9s")}, // plugin tick wait is earlier than requested upscale expiration -// NeonVMRequest: &core.ActionNeonVMRequest{ -// Current: resForCU(1), -// Target: resForCU(2), -// DesiredLogicalTime: nil, -// }, -// }) -// a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(2)) -// clockTick() -// a.Do(state.NeonVM().RequestSuccessful, clock.Now()) -// -// // Finally, tell the vm-monitor that it got upscaled: -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("4.8s")}, // still waiting on plugin tick -// MonitorUpscale: &core.ActionMonitorUpscale{ -// Current: resForCU(1), -// Target: resForCU(2), -// DesiredLogicalTime: nil, -// }, -// }) -// a.Do(state.Monitor().StartingUpscaleRequest, clock.Now(), resForCU(2)) -// clockTick() -// a.Do(state.Monitor().UpscaleRequestSuccessful, clock.Now()) -// -// // After everything, we should be waiting on both: -// // (a) scheduler plugin tick (4.7s remaining), and -// // (b) vm-monitor requested upscaling expiring (5.7s remaining) -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("4.7s")}, -// }) -// -// // Do the routine scheduler plugin request. Still waiting 1s for vm-monitor request expiration -// clock.Inc(duration("4.7s")) -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("1s")}, -// PluginRequest: &core.ActionPluginRequest{ -// LastPermit: lo.ToPtr(resForCU(2)), -// Target: resForCU(2), -// Metrics: lo.ToPtr(lastMetrics.ToAPI()), -// DesiredLogicalTime: nil, -// }, -// }) -// a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(2)) -// clockTick() -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("0.9s")}, // waiting for requested upscale expiring -// }) -// a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ -// Permit: resForCU(2), -// Migrate: nil, -// }) -// -// // Still should just be waiting on vm-monitor upscale expiring -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("0.9s")}, -// }) -// clock.Inc(duration("0.9s")) -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("4s")}, // now, waiting on plugin request tick -// MonitorDownscale: &core.ActionMonitorDownscale{ -// Current: resForCU(2), -// Target: resForCU(1), -// DesiredLogicalTime: nil, -// }, -// }) -//} -// -//// Checks that if we get new metrics partway through downscaling, then we pivot back to upscaling -//// without further requests in furtherance of downscaling. 
-//// -//// For example, if we pivot during the NeonVM request to do the downscaling, then the request to to -//// the scheduler plugin should never be made, because we decided against downscaling. -//func TestDownscalePivotBack(t *testing.T) { -// a := helpers.NewAssert(t) -// var clock *helpers.FakeClock -// -// clockTickDuration := duration("0.1s") -// clockTick := func() helpers.Elapsed { -// return clock.Inc(clockTickDuration) -// } -// halfClockTick := func() helpers.Elapsed { -// return clock.Inc(clockTickDuration / 2) -// } -// resForCU := DefaultComputeUnit.Mul -// -// var state *core.State -// nextActions := func() core.ActionSet { -// return state.NextActions(clock.Now()) -// } -// -// initialMetrics := core.SystemMetrics{ -// LoadAverage1Min: 0.0, -// MemoryUsageBytes: 0.0, -// } -// newMetrics := core.SystemMetrics{ -// LoadAverage1Min: 0.3, -// MemoryUsageBytes: 0.0, -// } -// -// steps := []struct { -// pre func(pluginWait *time.Duration, midRequest func()) -// post func(pluginWait *time.Duration) -// }{ -// // vm-monitor requests: -// { -// pre: func(pluginWait *time.Duration, midRequest func()) { -// t.Log(" > start vm-monitor downscale") -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: *pluginWait}, -// MonitorDownscale: &core.ActionMonitorDownscale{ -// Current: resForCU(2), -// Target: resForCU(1), -// -// DesiredLogicalTime: nil, -// }, -// }) -// a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(1)) -// halfClockTick() -// midRequest() -// halfClockTick() -// *pluginWait -= clockTickDuration -// t.Log(" > finish vm-monitor downscale") -// a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now()) -// }, -// post: func(pluginWait *time.Duration) { -// t.Log(" > start vm-monitor upscale") -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: *pluginWait}, -// MonitorUpscale: &core.ActionMonitorUpscale{ -// Current: resForCU(1), -// Target: resForCU(2), -// DesiredLogicalTime: nil, -// }, -// }) -// a.Do(state.Monitor().StartingUpscaleRequest, clock.Now(), resForCU(2)) -// clockTick() -// *pluginWait -= clockTickDuration -// t.Log(" > finish vm-monitor upscale") -// a.Do(state.Monitor().UpscaleRequestSuccessful, clock.Now()) -// }, -// }, -// // NeonVM requests -// { -// pre: func(pluginWait *time.Duration, midRequest func()) { -// t.Log(" > start NeonVM downscale") -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: *pluginWait}, -// NeonVMRequest: &core.ActionNeonVMRequest{ -// Current: resForCU(2), -// Target: resForCU(1), -// DesiredLogicalTime: nil, -// }, -// }) -// a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(1)) -// halfClockTick() -// midRequest() -// halfClockTick() -// *pluginWait -= clockTickDuration -// t.Log(" > finish NeonVM downscale") -// a.Do(state.NeonVM().RequestSuccessful, clock.Now()) -// }, -// post: func(pluginWait *time.Duration) { -// t.Log(" > start NeonVM upscale") -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: *pluginWait}, -// NeonVMRequest: &core.ActionNeonVMRequest{ -// Current: resForCU(1), -// Target: resForCU(2), -// DesiredLogicalTime: nil, -// }, -// }) -// a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(2)) -// clockTick() -// *pluginWait -= clockTickDuration -// t.Log(" > finish NeonVM upscale") -// a.Do(state.NeonVM().RequestSuccessful, clock.Now()) -// }, -// }, -// // plugin requests -// { -// pre: func(pluginWait *time.Duration, midRequest func()) { -// 
t.Log(" > start plugin downscale") -// a.Call(nextActions).Equals(core.ActionSet{ -// PluginRequest: &core.ActionPluginRequest{ -// LastPermit: lo.ToPtr(resForCU(2)), -// Target: resForCU(1), -// Metrics: lo.ToPtr(initialMetrics.ToAPI()), -// DesiredLogicalTime: nil, -// }, -// }) -// a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(1)) -// halfClockTick() -// midRequest() -// halfClockTick() -// *pluginWait = duration("4.9s") // reset because we just made a request -// t.Log(" > finish plugin downscale") -// a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ -// Permit: resForCU(1), -// Migrate: nil, -// }) -// }, -// post: func(pluginWait *time.Duration) { -// t.Log(" > start plugin upscale") -// a.Call(nextActions).Equals(core.ActionSet{ -// PluginRequest: &core.ActionPluginRequest{ -// LastPermit: lo.ToPtr(resForCU(1)), -// Target: resForCU(2), -// Metrics: lo.ToPtr(newMetrics.ToAPI()), -// DesiredLogicalTime: nil, -// }, -// }) -// a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(2)) -// clockTick() -// *pluginWait = duration("4.9s") // reset because we just made a request -// t.Log(" > finish plugin upscale") -// a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ -// Permit: resForCU(2), -// Migrate: nil, -// }) -// }, -// }, -// } -// -// for i := 0; i < len(steps); i++ { -// t.Logf("iter(%d)", i) -// -// // Initial setup -// clock = helpers.NewFakeClock(t) -// state = helpers.CreateInitialState( -// DefaultInitialStateConfig, -// helpers.WithStoredWarnings(a.StoredWarnings()), -// helpers.WithMinMaxCU(1, 3), -// helpers.WithCurrentCU(2), -// ) -// -// state.Monitor().Active(true) -// -// doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(2)) -// -// clockTick().AssertEquals(duration("0.2s")) -// pluginWait := duration("4.8s") -// -// a.Do(state.UpdateSystemMetrics, initialMetrics) -// // double-check that we agree about the desired resources -// a.Call(getDesiredResources, state, clock.Now()). -// Equals(resForCU(1)) -// -// for j := 0; j <= i; j++ { -// midRequest := func() {} -// if j == i { -// // at the midpoint, start backtracking by setting the metrics -// midRequest = func() { -// t.Log(" > > updating metrics mid-request") -// a.Do(state.UpdateSystemMetrics, newMetrics) -// a.Call(getDesiredResources, state, clock.Now()). 
-// Equals(resForCU(2)) -// } -// } -// -// steps[j].pre(&pluginWait, midRequest) -// } -// -// for j := i; j >= 0; j-- { -// steps[j].post(&pluginWait) -// } -// } -//} -// -//// Checks that if the VM's min/max bounds change so that the maximum is below the current and -//// desired usage, we try to downscale -//func TestBoundsChangeRequiresDownsale(t *testing.T) { -// a := helpers.NewAssert(t) -// clock := helpers.NewFakeClock(t) -// clockTick := func() { -// clock.Inc(100 * time.Millisecond) -// } -// resForCU := DefaultComputeUnit.Mul -// -// state := helpers.CreateInitialState( -// DefaultInitialStateConfig, -// helpers.WithStoredWarnings(a.StoredWarnings()), -// helpers.WithMinMaxCU(1, 3), -// helpers.WithCurrentCU(2), -// ) -// nextActions := func() core.ActionSet { -// return state.NextActions(clock.Now()) -// } -// -// state.Monitor().Active(true) -// -// // Send initial scheduler request: -// doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(2)) -// -// clockTick() -// -// // Set metrics so the desired resources are still 2 CU -// metrics := core.SystemMetrics{ -// LoadAverage1Min: 0.3, -// MemoryUsageBytes: 0.0, -// } -// a.Do(state.UpdateSystemMetrics, metrics) -// // Check that we agree about desired resources -// a.Call(getDesiredResources, state, clock.Now()). -// Equals(resForCU(2)) -// // Check we've got nothing to do yet -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("4.8s")}, -// }) -// -// clockTick() -// -// // Update the VM to set min=max=1 CU -// a.Do(state.UpdatedVM, helpers.CreateVmInfo( -// DefaultInitialStateConfig.VM, -// helpers.WithCurrentCU(2), -// helpers.WithMinMaxCU(1, 1), -// )) -// -// // We should be making a vm-monitor downscaling request -// // TODO: In the future, we should have a "force-downscale" alternative so the vm-monitor doesn't -// // get to deny the downscaling. 
-// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("4.7s")}, -// MonitorDownscale: &core.ActionMonitorDownscale{ -// Current: resForCU(2), -// Target: resForCU(1), -// -// DesiredLogicalTime: nil, -// }, -// }) -// a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(1)) -// clockTick() -// a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now()) -// // Do NeonVM request for that downscaling -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("4.6s")}, -// NeonVMRequest: &core.ActionNeonVMRequest{ -// Current: resForCU(2), -// Target: resForCU(1), -// -// DesiredLogicalTime: nil, -// }, -// }) -// a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(1)) -// clockTick() -// a.Do(state.NeonVM().RequestSuccessful, clock.Now()) -// // Do plugin request for that downscaling: -// a.Call(nextActions).Equals(core.ActionSet{ -// PluginRequest: &core.ActionPluginRequest{ -// LastPermit: lo.ToPtr(resForCU(2)), -// Target: resForCU(1), -// Metrics: lo.ToPtr(metrics.ToAPI()), -// DesiredLogicalTime: nil, -// }, -// }) -// a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(1)) -// clockTick() -// a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ -// Permit: resForCU(1), -// Migrate: nil, -// }) -// // And then, we shouldn't need to do anything else: -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("4.9s")}, -// }) -//} -// -//// Checks that if the VM's min/max bounds change so that the minimum is above the current and -//// desired usage, we try to upscale -//func TestBoundsChangeRequiresUpscale(t *testing.T) { -// a := helpers.NewAssert(t) -// clock := helpers.NewFakeClock(t) -// clockTick := func() { -// clock.Inc(100 * time.Millisecond) -// } -// resForCU := DefaultComputeUnit.Mul -// -// state := helpers.CreateInitialState( -// DefaultInitialStateConfig, -// helpers.WithStoredWarnings(a.StoredWarnings()), -// helpers.WithMinMaxCU(1, 3), -// helpers.WithCurrentCU(2), -// ) -// nextActions := func() core.ActionSet { -// return state.NextActions(clock.Now()) -// } -// -// state.Monitor().Active(true) -// -// // Send initial scheduler request: -// doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(2)) -// -// clockTick() -// -// // Set metrics so the desired resources are still 2 CU -// metrics := core.SystemMetrics{ -// LoadAverage1Min: 0.3, -// MemoryUsageBytes: 0.0, -// } -// a.Do(state.UpdateSystemMetrics, metrics) -// // Check that we agree about desired resources -// a.Call(getDesiredResources, state, clock.Now()). 
-// Equals(resForCU(2)) -// // Check we've got nothing to do yet -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("4.8s")}, -// }) -// -// clockTick() -// -// // Update the VM to set min=max=3 CU -// a.Do(state.UpdatedVM, helpers.CreateVmInfo( -// DefaultInitialStateConfig.VM, -// helpers.WithCurrentCU(2), -// helpers.WithMinMaxCU(3, 3), -// )) -// -// // We should be making a plugin request to get upscaling: -// a.Call(nextActions).Equals(core.ActionSet{ -// PluginRequest: &core.ActionPluginRequest{ -// LastPermit: lo.ToPtr(resForCU(2)), -// Target: resForCU(3), -// Metrics: lo.ToPtr(metrics.ToAPI()), -// DesiredLogicalTime: nil, -// }, -// }) -// a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(3)) -// clockTick() -// a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ -// Permit: resForCU(3), -// Migrate: nil, -// }) -// // Do NeonVM request for the upscaling -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("4.9s")}, -// NeonVMRequest: &core.ActionNeonVMRequest{ -// Current: resForCU(2), -// Target: resForCU(3), -// DesiredLogicalTime: nil, -// }, -// }) -// a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(3)) -// clockTick() -// a.Do(state.NeonVM().RequestSuccessful, clock.Now()) -// // Do vm-monitor upscale request -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("4.8s")}, -// MonitorUpscale: &core.ActionMonitorUpscale{ -// Current: resForCU(2), -// Target: resForCU(3), -// DesiredLogicalTime: nil, -// }, -// }) -// a.Do(state.Monitor().StartingUpscaleRequest, clock.Now(), resForCU(3)) -// clockTick() -// a.Do(state.Monitor().UpscaleRequestSuccessful, clock.Now()) -// // And then, we shouldn't need to do anything else: -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("4.7s")}, -// }) -//} -// -//// Checks that failed requests to the scheduler plugin and NeonVM API will be retried after a delay -//func TestFailedRequestRetry(t *testing.T) { -// a := helpers.NewAssert(t) -// clock := helpers.NewFakeClock(t) -// clockTick := func() { -// clock.Inc(100 * time.Millisecond) -// } -// resForCU := DefaultComputeUnit.Mul -// -// state := helpers.CreateInitialState( -// DefaultInitialStateConfig, -// helpers.WithStoredWarnings(a.StoredWarnings()), -// helpers.WithMinMaxCU(1, 2), -// helpers.WithCurrentCU(1), -// helpers.WithConfigSetting(func(c *core.Config) { -// // Override values for consistency and ease of use -// c.PluginRetryWait = duration("2s") -// c.NeonVMRetryWait = duration("3s") -// }), -// ) -// nextActions := func() core.ActionSet { -// return state.NextActions(clock.Now()) -// } -// -// state.Monitor().Active(true) -// -// // Send initial scheduler request -// doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(1)) -// -// // Set metrics so that we should be trying to upscale -// clockTick() -// metrics := core.SystemMetrics{ -// LoadAverage1Min: 0.3, -// MemoryUsageBytes: 0.0, -// } -// a.Do(state.UpdateSystemMetrics, metrics) -// -// // We should be asking the scheduler for upscaling -// a.Call(nextActions).Equals(core.ActionSet{ -// PluginRequest: &core.ActionPluginRequest{ -// LastPermit: lo.ToPtr(resForCU(1)), -// Target: resForCU(2), -// Metrics: lo.ToPtr(metrics.ToAPI()), -// DesiredLogicalTime: nil, -// }, -// }) -// a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(2)) -// clockTick() -// // On request 
failure, we retry after Config.PluginRetryWait -// a.Do(state.Plugin().RequestFailed, clock.Now()) -// a. -// WithWarnings("Wanted to make a request to the scheduler plugin, but previous request failed too recently"). -// Call(nextActions). -// Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("2s")}, -// }) -// clock.Inc(duration("2s")) -// // ... and then retry: -// a.Call(nextActions).Equals(core.ActionSet{ -// PluginRequest: &core.ActionPluginRequest{ -// LastPermit: lo.ToPtr(resForCU(1)), -// Target: resForCU(2), -// Metrics: lo.ToPtr(metrics.ToAPI()), -// DesiredLogicalTime: nil, -// }, -// }) -// a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(2)) -// clockTick() -// a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ -// Permit: resForCU(2), -// Migrate: nil, -// }) -// -// // Now, after plugin request is successful, we should be making a request to NeonVM. -// // We'll have that request fail the first time as well: -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("4.9s")}, // plugin request tick -// NeonVMRequest: &core.ActionNeonVMRequest{ -// Current: resForCU(1), -// Target: resForCU(2), -// DesiredLogicalTime: nil, -// }, -// }) -// a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(2)) -// clockTick() -// // On request failure, we retry after Config.NeonVMRetryWait -// a.Do(state.NeonVM().RequestFailed, clock.Now()) -// a. -// WithWarnings("Wanted to make a request to NeonVM API, but recent request failed too recently"). -// Call(nextActions). -// Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("3s")}, // NeonVM retry wait is less than current plugin request tick (4.8s remaining) -// }) -// clock.Inc(duration("3s")) -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("1.8s")}, // plugin request tick -// NeonVMRequest: &core.ActionNeonVMRequest{ -// Current: resForCU(1), -// Target: resForCU(2), -// DesiredLogicalTime: nil, -// }, -// }) -// a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(2)) -// clockTick() -// a.Do(state.NeonVM().RequestSuccessful, clock.Now()) -// -// // And then finally, we should be looking to inform the vm-monitor about this upscaling. -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("1.7s")}, // plugin request tick -// MonitorUpscale: &core.ActionMonitorUpscale{ -// Current: resForCU(1), -// Target: resForCU(2), -// DesiredLogicalTime: nil, -// }, -// }) -//} -// -//// Checks that when metrics are updated during the downscaling process, between the NeonVM request -//// and plugin request, we keep those processes mostly separate, without interference between them. -//// -//// This is distilled from a bug found on staging that resulted in faulty requests to the plugin. -//func TestMetricsConcurrentUpdatedDuringDownscale(t *testing.T) { -// a := helpers.NewAssert(t) -// clock := helpers.NewFakeClock(t) -// clockTick := func() { -// clock.Inc(100 * time.Millisecond) -// } -// resForCU := DefaultComputeUnit.Mul -// -// state := helpers.CreateInitialState( -// DefaultInitialStateConfig, -// helpers.WithStoredWarnings(a.StoredWarnings()), -// // NOTE: current CU is greater than max CU. This is in line with what happens when -// // unassigned pooled VMs created by the control plane are first assigned and endpoint and -// // must immediately scale down. 
-// helpers.WithMinMaxCU(1, 2), -// helpers.WithCurrentCU(3), -// ) -// nextActions := func() core.ActionSet { -// return state.NextActions(clock.Now()) -// } -// -// // Send initial scheduler request - without the monitor active, so we're stuck at 4 CU for now -// a. -// WithWarnings("Wanted to send vm-monitor downscale request, but there's no active connection"). -// Call(state.NextActions, clock.Now()). -// Equals(core.ActionSet{ -// PluginRequest: &core.ActionPluginRequest{ -// LastPermit: nil, -// Target: resForCU(3), -// Metrics: nil, -// DesiredLogicalTime: nil, -// }, -// }) -// a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(3)) -// clockTick() -// a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ -// Permit: resForCU(3), -// Migrate: nil, -// }) -// -// clockTick() -// -// // Monitor's now active, so we should be asking it for downscaling. -// // We don't yet have metrics though, so we only want to downscale as much as is required. -// state.Monitor().Active(true) -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("4.8s")}, -// MonitorDownscale: &core.ActionMonitorDownscale{ -// Current: resForCU(3), -// Target: resForCU(2), -// DesiredLogicalTime: nil, -// }, -// }) -// a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(2)) -// -// // In the middle of the vm-monitor request, update the metrics so that now the desired resource -// // usage is actually 1 CU -// clockTick() -// // the actual metrics we got in the actual logs -// metrics := core.SystemMetrics{ -// LoadAverage1Min: 0.0, -// MemoryUsageBytes: 150589570, // 143.6 MiB -// } -// a.Do(state.UpdateSystemMetrics, metrics) -// -// // nothing to do yet, until the existing vm-monitor request finishes -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("4.7s")}, // plugin request tick wait -// }) -// -// clockTick() -// -// // When the vm-monitor request finishes, we want to both -// // (a) request additional downscaling from vm-monitor, and -// // (b) make a NeonVM request for the initially approved downscaling -// a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now()) -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("4.6s")}, // plugin request tick wait -// NeonVMRequest: &core.ActionNeonVMRequest{ -// Current: resForCU(3), -// Target: resForCU(2), -// DesiredLogicalTime: nil, -// }, -// MonitorDownscale: &core.ActionMonitorDownscale{ -// Current: resForCU(2), -// Target: resForCU(1), -// DesiredLogicalTime: nil, -// }, -// }) -// // Start both requests. The vm-monitor request will finish first, but after that we'll just be -// // waiting on the NeonVM request (and then redoing a follow-up for more downscaling). -// a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(2)) -// a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(1)) -// -// clockTick() -// -// a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now()) -// a. -// WithWarnings( -// "Wanted to make a request to NeonVM API, but there's already NeonVM request (for different resources) ongoing", -// ). -// Call(nextActions). -// Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("4.5s")}, // plugin request tick wait -// }) -// -// clockTick() -// -// a.Do(state.NeonVM().RequestSuccessful, clock.Now()) -// state.Debug(true) -// a. -// Call(nextActions). 
-//		Equals(core.ActionSet{
-//			// At this point in the original logs from staging, the intended request to the plugin was
-//			// incorrectly for 1 CU, rather than 2 CU. So, the rest of this test case is mostly just
-//			// rounding out the rest of the scale-down routine.
-//			PluginRequest: &core.ActionPluginRequest{
-//				LastPermit:         lo.ToPtr(resForCU(3)),
-//				Target:             resForCU(2),
-//				Metrics:            lo.ToPtr(metrics.ToAPI()),
-//				DesiredLogicalTime: nil,
-//			},
-//			NeonVMRequest: &core.ActionNeonVMRequest{
-//				Current:            resForCU(2),
-//				Target:             resForCU(1),
-//				DesiredLogicalTime: nil,
-//			},
-//		})
-//	a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(2))
-//	a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(1))
-//
-//	clockTick()
-//
-//	a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{
-//		Permit:  resForCU(2),
-//		Migrate: nil,
-//	})
-//	// Still waiting for NeonVM request to complete
-//	a.Call(nextActions).Equals(core.ActionSet{
-//		Wait: &core.ActionWait{Duration: duration("4.9s")}, // plugin request tick wait
-//	})
-//
-//	clockTick()
-//
-//	// After the NeonVM request finishes, all that we have left to do is inform the plugin of the
-//	// final downscaling.
-//	a.Do(state.NeonVM().RequestSuccessful, clock.Now())
-//	a.Call(nextActions).Equals(core.ActionSet{
-//		PluginRequest: &core.ActionPluginRequest{
-//			LastPermit:         lo.ToPtr(resForCU(2)),
-//			Target:             resForCU(1),
-//			Metrics:            lo.ToPtr(metrics.ToAPI()),
-//			DesiredLogicalTime: nil,
-//		},
-//	})
-//	a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(1))
-//
-//	clockTick()
+// For example, if we pivot during the NeonVM request to do the downscaling, then the request to
+// the scheduler plugin should never be made, because we decided against downscaling.
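+//
+// Roughly, each iteration of the test below walks the downscale pipeline (vm-monitor -> NeonVM ->
+// plugin) up to step i, updates the metrics mid-request so that the desired resources go back up
+// to 2 CU, and then walks the same steps back in reverse as an upscale, so the downscale steps
+// after step i are never reached.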
+func TestDownscalePivotBack(t *testing.T) { + a := helpers.NewAssert(t) + var clock *helpers.FakeClock + + clockTickDuration := duration("0.1s") + clockTick := func() helpers.Elapsed { + return clock.Inc(clockTickDuration) + } + halfClockTick := func() helpers.Elapsed { + return clock.Inc(clockTickDuration / 2) + } + resForCU := DefaultComputeUnit.Mul + + var state *core.State + nextActions := func() core.ActionSet { + return state.NextActions(clock.Now()) + } + + initialMetrics := core.SystemMetrics{ + LoadAverage1Min: 0.0, + MemoryUsageBytes: 0.0, + } + newMetrics := core.SystemMetrics{ + LoadAverage1Min: 0.3, + MemoryUsageBytes: 0.0, + } + + steps := []struct { + pre func(pluginWait *time.Duration, midRequest func()) + post func(pluginWait *time.Duration) + }{ + // vm-monitor requests: + { + pre: func(pluginWait *time.Duration, midRequest func()) { + t.Log(" > start vm-monitor downscale") + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: *pluginWait}, + MonitorDownscale: &core.ActionMonitorDownscale{ + Current: resForCU(2), + Target: resForCU(1), + + TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + }, + }) + a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(1)) + halfClockTick() + midRequest() + halfClockTick() + *pluginWait -= clockTickDuration + t.Log(" > finish vm-monitor downscale") + a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now(), vmv1.ZeroRevision.WithTime(clock.Now())) + }, + post: func(pluginWait *time.Duration) { + t.Log(" > start vm-monitor upscale") + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: *pluginWait}, + MonitorUpscale: &core.ActionMonitorUpscale{ + Current: resForCU(1), + Target: resForCU(2), + TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + }, + }) + a.Do(state.Monitor().StartingUpscaleRequest, clock.Now(), resForCU(2)) + clockTick() + *pluginWait -= clockTickDuration + t.Log(" > finish vm-monitor upscale") + a.Do(state.Monitor().UpscaleRequestSuccessful, clock.Now()) + }, + }, + // NeonVM requests + { + pre: func(pluginWait *time.Duration, midRequest func()) { + t.Log(" > start NeonVM downscale") + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: *pluginWait}, + NeonVMRequest: &core.ActionNeonVMRequest{ + Current: resForCU(2), + Target: resForCU(1), + TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + }, + }) + a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(1)) + halfClockTick() + midRequest() + halfClockTick() + *pluginWait -= clockTickDuration + t.Log(" > finish NeonVM downscale") + a.Do(state.NeonVM().RequestSuccessful, clock.Now()) + }, + post: func(pluginWait *time.Duration) { + t.Log(" > start NeonVM upscale") + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: *pluginWait}, + NeonVMRequest: &core.ActionNeonVMRequest{ + Current: resForCU(1), + Target: resForCU(2), + TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + }, + }) + a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(2)) + clockTick() + *pluginWait -= clockTickDuration + t.Log(" > finish NeonVM upscale") + a.Do(state.NeonVM().RequestSuccessful, clock.Now()) + }, + }, + // plugin requests + { + pre: func(pluginWait *time.Duration, midRequest func()) { + t.Log(" > start plugin downscale") + a.Call(nextActions).Equals(core.ActionSet{ + PluginRequest: &core.ActionPluginRequest{ + LastPermit: lo.ToPtr(resForCU(2)), + Target: resForCU(1), + Metrics: lo.ToPtr(initialMetrics.ToAPI()), + TargetRevision: 
vmv1.ZeroRevision.WithTime(clock.Now()), + }, + }) + a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(1)) + halfClockTick() + midRequest() + halfClockTick() + *pluginWait = duration("4.9s") // reset because we just made a request + t.Log(" > finish plugin downscale") + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), vmv1.ZeroRevision.WithTime(clock.Now()), api.PluginResponse{ + Permit: resForCU(1), + Migrate: nil, + }) + }, + post: func(pluginWait *time.Duration) { + t.Log(" > start plugin upscale") + a.Call(nextActions).Equals(core.ActionSet{ + PluginRequest: &core.ActionPluginRequest{ + LastPermit: lo.ToPtr(resForCU(1)), + Target: resForCU(2), + Metrics: lo.ToPtr(newMetrics.ToAPI()), + TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + }, + }) + a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(2)) + clockTick() + *pluginWait = duration("4.9s") // reset because we just made a request + t.Log(" > finish plugin upscale") + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), vmv1.ZeroRevision.WithTime(clock.Now()), api.PluginResponse{ + Permit: resForCU(2), + Migrate: nil, + }) + }, + }, + } + + for i := 0; i < len(steps); i++ { + t.Logf("iter(%d)", i) + + // Initial setup + clock = helpers.NewFakeClock(t) + state = helpers.CreateInitialState( + DefaultInitialStateConfig, + helpers.WithStoredWarnings(a.StoredWarnings()), + helpers.WithMinMaxCU(1, 3), + helpers.WithCurrentCU(2), + ) + + state.Monitor().Active(true) + + doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(2)) + + clockTick().AssertEquals(duration("0.2s")) + pluginWait := duration("4.8s") + + a.Do(state.UpdateSystemMetrics, initialMetrics) + // double-check that we agree about the desired resources + a.Call(getDesiredResources, state, clock.Now()). + Equals(resForCU(1)) + + for j := 0; j <= i; j++ { + midRequest := func() {} + if j == i { + // at the midpoint, start backtracking by setting the metrics + midRequest = func() { + t.Log(" > > updating metrics mid-request") + a.Do(state.UpdateSystemMetrics, newMetrics) + a.Call(getDesiredResources, state, clock.Now()). + Equals(resForCU(2)) + } + } + + steps[j].pre(&pluginWait, midRequest) + } + + for j := i; j >= 0; j-- { + steps[j].post(&pluginWait) + } + } +} + +// Checks that if the VM's min/max bounds change so that the maximum is below the current and +// desired usage, we try to downscale +func TestBoundsChangeRequiresDownsale(t *testing.T) { + a := helpers.NewAssert(t) + clock := helpers.NewFakeClock(t) + clockTick := func() { + clock.Inc(100 * time.Millisecond) + } + resForCU := DefaultComputeUnit.Mul + + state := helpers.CreateInitialState( + DefaultInitialStateConfig, + helpers.WithStoredWarnings(a.StoredWarnings()), + helpers.WithMinMaxCU(1, 3), + helpers.WithCurrentCU(2), + ) + nextActions := func() core.ActionSet { + return state.NextActions(clock.Now()) + } + + state.Monitor().Active(true) + + // Send initial scheduler request: + doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(2)) + + clockTick() + + // Set metrics so the desired resources are still 2 CU + metrics := core.SystemMetrics{ + LoadAverage1Min: 0.3, + MemoryUsageBytes: 0.0, + } + a.Do(state.UpdateSystemMetrics, metrics) + // Check that we agree about desired resources + a.Call(getDesiredResources, state, clock.Now()). 
+ Equals(resForCU(2)) + // Check we've got nothing to do yet + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("4.8s")}, + }) + + clockTick() + + // Update the VM to set min=max=1 CU + a.Do(state.UpdatedVM, helpers.CreateVmInfo( + DefaultInitialStateConfig.VM, + helpers.WithCurrentCU(2), + helpers.WithMinMaxCU(1, 1), + )) + + // We should be making a vm-monitor downscaling request + // TODO: In the future, we should have a "force-downscale" alternative so the vm-monitor doesn't + // get to deny the downscaling. + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("4.7s")}, + MonitorDownscale: &core.ActionMonitorDownscale{ + Current: resForCU(2), + Target: resForCU(1), + + TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + }, + }) + a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(1)) + clockTick() + a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now(), vmv1.ZeroRevision.WithTime(clock.Now())) + // Do NeonVM request for that downscaling + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("4.6s")}, + NeonVMRequest: &core.ActionNeonVMRequest{ + Current: resForCU(2), + Target: resForCU(1), + + TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + }, + }) + a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(1)) + clockTick() + a.Do(state.NeonVM().RequestSuccessful, clock.Now()) + // Do plugin request for that downscaling: + a.Call(nextActions).Equals(core.ActionSet{ + PluginRequest: &core.ActionPluginRequest{ + LastPermit: lo.ToPtr(resForCU(2)), + Target: resForCU(1), + Metrics: lo.ToPtr(metrics.ToAPI()), + TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + }, + }) + a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(1)) + clockTick() + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), vmv1.ZeroRevision.WithTime(clock.Now()), api.PluginResponse{ + Permit: resForCU(1), + Migrate: nil, + }) + // And then, we shouldn't need to do anything else: + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("4.9s")}, + }) +} + +// Checks that if the VM's min/max bounds change so that the minimum is above the current and +// desired usage, we try to upscale +func TestBoundsChangeRequiresUpscale(t *testing.T) { + a := helpers.NewAssert(t) + clock := helpers.NewFakeClock(t) + clockTick := func() { + clock.Inc(100 * time.Millisecond) + } + resForCU := DefaultComputeUnit.Mul + + state := helpers.CreateInitialState( + DefaultInitialStateConfig, + helpers.WithStoredWarnings(a.StoredWarnings()), + helpers.WithMinMaxCU(1, 3), + helpers.WithCurrentCU(2), + ) + nextActions := func() core.ActionSet { + return state.NextActions(clock.Now()) + } + + state.Monitor().Active(true) + + // Send initial scheduler request: + doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(2)) + + clockTick() + + // Set metrics so the desired resources are still 2 CU + metrics := core.SystemMetrics{ + LoadAverage1Min: 0.3, + MemoryUsageBytes: 0.0, + } + a.Do(state.UpdateSystemMetrics, metrics) + // Check that we agree about desired resources + a.Call(getDesiredResources, state, clock.Now()). 
+ Equals(resForCU(2)) + // Check we've got nothing to do yet + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("4.8s")}, + }) + + clockTick() + + // Update the VM to set min=max=3 CU + a.Do(state.UpdatedVM, helpers.CreateVmInfo( + DefaultInitialStateConfig.VM, + helpers.WithCurrentCU(2), + helpers.WithMinMaxCU(3, 3), + )) + + // We should be making a plugin request to get upscaling: + a.Call(nextActions).Equals(core.ActionSet{ + PluginRequest: &core.ActionPluginRequest{ + LastPermit: lo.ToPtr(resForCU(2)), + Target: resForCU(3), + Metrics: lo.ToPtr(metrics.ToAPI()), + TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + }, + }) + a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(3)) + clockTick() + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), vmv1.ZeroRevision.WithTime(clock.Now()), api.PluginResponse{ + Permit: resForCU(3), + Migrate: nil, + }) + // Do NeonVM request for the upscaling + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("4.9s")}, + NeonVMRequest: &core.ActionNeonVMRequest{ + Current: resForCU(2), + Target: resForCU(3), + TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + }, + }) + a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(3)) + clockTick() + a.Do(state.NeonVM().RequestSuccessful, clock.Now()) + // Do vm-monitor upscale request + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("4.8s")}, + MonitorUpscale: &core.ActionMonitorUpscale{ + Current: resForCU(2), + Target: resForCU(3), + TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + }, + }) + a.Do(state.Monitor().StartingUpscaleRequest, clock.Now(), resForCU(3)) + clockTick() + a.Do(state.Monitor().UpscaleRequestSuccessful, clock.Now()) + // And then, we shouldn't need to do anything else: + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("4.7s")}, + }) +} + +// Checks that failed requests to the scheduler plugin and NeonVM API will be retried after a delay +func TestFailedRequestRetry(t *testing.T) { + a := helpers.NewAssert(t) + clock := helpers.NewFakeClock(t) + clockTick := func() { + clock.Inc(100 * time.Millisecond) + } + resForCU := DefaultComputeUnit.Mul + + state := helpers.CreateInitialState( + DefaultInitialStateConfig, + helpers.WithStoredWarnings(a.StoredWarnings()), + helpers.WithMinMaxCU(1, 2), + helpers.WithCurrentCU(1), + helpers.WithConfigSetting(func(c *core.Config) { + // Override values for consistency and ease of use + c.PluginRetryWait = duration("2s") + c.NeonVMRetryWait = duration("3s") + }), + ) + nextActions := func() core.ActionSet { + return state.NextActions(clock.Now()) + } + + state.Monitor().Active(true) + + // Send initial scheduler request + doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(1)) + + // Set metrics so that we should be trying to upscale + clockTick() + metrics := core.SystemMetrics{ + LoadAverage1Min: 0.3, + MemoryUsageBytes: 0.0, + } + a.Do(state.UpdateSystemMetrics, metrics) + + // We should be asking the scheduler for upscaling + a.Call(nextActions).Equals(core.ActionSet{ + PluginRequest: &core.ActionPluginRequest{ + LastPermit: lo.ToPtr(resForCU(1)), + Target: resForCU(2), + Metrics: lo.ToPtr(metrics.ToAPI()), + TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + }, + }) + a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(2)) + clockTick() + // On request failure, we retry after Config.PluginRetryWait + 
a.Do(state.Plugin().RequestFailed, clock.Now()) + a. + WithWarnings("Wanted to make a request to the scheduler plugin, but previous request failed too recently"). + Call(nextActions). + Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("2s")}, + }) + clock.Inc(duration("2s")) + // ... and then retry: + a.Call(nextActions).Equals(core.ActionSet{ + PluginRequest: &core.ActionPluginRequest{ + LastPermit: lo.ToPtr(resForCU(1)), + Target: resForCU(2), + Metrics: lo.ToPtr(metrics.ToAPI()), + TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + }, + }) + a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(2)) + clockTick() + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), vmv1.ZeroRevision.WithTime(clock.Now()), api.PluginResponse{ + Permit: resForCU(2), + Migrate: nil, + }) + + // Now, after plugin request is successful, we should be making a request to NeonVM. + // We'll have that request fail the first time as well: + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("4.9s")}, // plugin request tick + NeonVMRequest: &core.ActionNeonVMRequest{ + Current: resForCU(1), + Target: resForCU(2), + TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + }, + }) + a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(2)) + clockTick() + // On request failure, we retry after Config.NeonVMRetryWait + a.Do(state.NeonVM().RequestFailed, clock.Now()) + a. + WithWarnings("Wanted to make a request to NeonVM API, but recent request failed too recently"). + Call(nextActions). + Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("3s")}, // NeonVM retry wait is less than current plugin request tick (4.8s remaining) + }) + clock.Inc(duration("3s")) + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("1.8s")}, // plugin request tick + NeonVMRequest: &core.ActionNeonVMRequest{ + Current: resForCU(1), + Target: resForCU(2), + TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + }, + }) + a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(2)) + clockTick() + a.Do(state.NeonVM().RequestSuccessful, clock.Now()) + + // And then finally, we should be looking to inform the vm-monitor about this upscaling. + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("1.7s")}, // plugin request tick + MonitorUpscale: &core.ActionMonitorUpscale{ + Current: resForCU(1), + Target: resForCU(2), + TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + }, + }) +} + +// Checks that when metrics are updated during the downscaling process, between the NeonVM request +// and plugin request, we keep those processes mostly separate, without interference between them. // -// a.NoError(state.Plugin().RequestSuccessful, clock.Now(), NilLogicalTime, api.PluginResponse{ -// Permit: resForCU(1), -// Migrate: nil, -// }) -// // Nothing left to do -// a.Call(nextActions).Equals(core.ActionSet{ -// Wait: &core.ActionWait{Duration: duration("4.9s")}, // plugin request tick wait -// }) -//} +// This is distilled from a bug found on staging that resulted in faulty requests to the plugin. 
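+//
+// Roughly, the scenario exercised below: the VM starts above its max CU, the vm-monitor approves
+// the first downscale step, new metrics arrive mid-request that justify downscaling even further,
+// and the subsequent plugin request must still be for the already-approved 2 CU rather than the
+// newly desired 1 CU.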
+func TestMetricsConcurrentUpdatedDuringDownscale(t *testing.T) { + a := helpers.NewAssert(t) + clock := helpers.NewFakeClock(t) + clockTick := func() { + clock.Inc(100 * time.Millisecond) + } + resForCU := DefaultComputeUnit.Mul + + state := helpers.CreateInitialState( + DefaultInitialStateConfig, + helpers.WithStoredWarnings(a.StoredWarnings()), + // NOTE: current CU is greater than max CU. This is in line with what happens when + // unassigned pooled VMs created by the control plane are first assigned and endpoint and + // must immediately scale down. + helpers.WithMinMaxCU(1, 2), + helpers.WithCurrentCU(3), + ) + nextActions := func() core.ActionSet { + return state.NextActions(clock.Now()) + } + + // Send initial scheduler request - without the monitor active, so we're stuck at 4 CU for now + a. + WithWarnings("Wanted to send vm-monitor downscale request, but there's no active connection"). + Call(state.NextActions, clock.Now()). + Equals(core.ActionSet{ + PluginRequest: &core.ActionPluginRequest{ + LastPermit: nil, + Target: resForCU(3), + Metrics: nil, + TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + }, + }) + a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(3)) + clockTick() + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), vmv1.ZeroRevision.WithTime(clock.Now()), api.PluginResponse{ + Permit: resForCU(3), + Migrate: nil, + }) + + clockTick() + + // Monitor's now active, so we should be asking it for downscaling. + // We don't yet have metrics though, so we only want to downscale as much as is required. + state.Monitor().Active(true) + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("4.8s")}, + MonitorDownscale: &core.ActionMonitorDownscale{ + Current: resForCU(3), + Target: resForCU(2), + TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + }, + }) + a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(2)) + + // In the middle of the vm-monitor request, update the metrics so that now the desired resource + // usage is actually 1 CU + clockTick() + // the actual metrics we got in the actual logs + metrics := core.SystemMetrics{ + LoadAverage1Min: 0.0, + MemoryUsageBytes: 150589570, // 143.6 MiB + } + a.Do(state.UpdateSystemMetrics, metrics) + + // nothing to do yet, until the existing vm-monitor request finishes + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("4.7s")}, // plugin request tick wait + }) + + clockTick() + + // When the vm-monitor request finishes, we want to both + // (a) request additional downscaling from vm-monitor, and + // (b) make a NeonVM request for the initially approved downscaling + a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now(), vmv1.ZeroRevision.WithTime(clock.Now())) + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("4.6s")}, // plugin request tick wait + NeonVMRequest: &core.ActionNeonVMRequest{ + Current: resForCU(3), + Target: resForCU(2), + TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + }, + MonitorDownscale: &core.ActionMonitorDownscale{ + Current: resForCU(2), + Target: resForCU(1), + TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + }, + }) + // Start both requests. The vm-monitor request will finish first, but after that we'll just be + // waiting on the NeonVM request (and then redoing a follow-up for more downscaling). 
+ a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(2)) + a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(1)) + + clockTick() + + a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now(), vmv1.ZeroRevision.WithTime(clock.Now())) + a. + WithWarnings( + "Wanted to make a request to NeonVM API, but there's already NeonVM request (for different resources) ongoing", + ). + Call(nextActions). + Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("4.5s")}, // plugin request tick wait + }) + + clockTick() + + a.Do(state.NeonVM().RequestSuccessful, clock.Now()) + state.Debug(true) + a. + Call(nextActions). + Equals(core.ActionSet{ + // At this point in the original logs from staging, the intended request to the plugin was + // incorrectly for 1 CU, rather than 2 CU. So, the rest of this test case is mostly just + // rounding out the rest of the scale-down routine. + PluginRequest: &core.ActionPluginRequest{ + LastPermit: lo.ToPtr(resForCU(3)), + Target: resForCU(2), + Metrics: lo.ToPtr(metrics.ToAPI()), + TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + }, + NeonVMRequest: &core.ActionNeonVMRequest{ + Current: resForCU(2), + Target: resForCU(1), + TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + }, + }) + a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(2)) + a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(1)) + + clockTick() + + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), vmv1.ZeroRevision.WithTime(clock.Now()), api.PluginResponse{ + Permit: resForCU(2), + Migrate: nil, + }) + // Still waiting for NeonVM request to complete + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("4.9s")}, // plugin request tick wait + }) + + clockTick() + + // After the NeonVM request finishes, all that we have left to do is inform the plugin of the + // final downscaling. 
+ a.Do(state.NeonVM().RequestSuccessful, clock.Now()) + a.Call(nextActions).Equals(core.ActionSet{ + PluginRequest: &core.ActionPluginRequest{ + LastPermit: lo.ToPtr(resForCU(2)), + Target: resForCU(1), + Metrics: lo.ToPtr(metrics.ToAPI()), + TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + }, + }) + a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(1)) + + clockTick() + + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), vmv1.ZeroRevision.WithTime(clock.Now()), api.PluginResponse{ + Permit: resForCU(1), + Migrate: nil, + }) + // Nothing left to do + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("4.9s")}, // plugin request tick wait + }) +} From 365af63dbbc5473208ec9a2ba676ec56a5673c6a Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Sun, 14 Jul 2024 22:30:48 +0400 Subject: [PATCH 27/57] fix tests Signed-off-by: Oleg Vasilev --- pkg/agent/core/revsource/revsource_test.go | 28 ++++++++++++---------- pkg/agent/core/state_test.go | 2 +- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/pkg/agent/core/revsource/revsource_test.go b/pkg/agent/core/revsource/revsource_test.go index 75d66f49e..1b5d185df 100644 --- a/pkg/agent/core/revsource/revsource_test.go +++ b/pkg/agent/core/revsource/revsource_test.go @@ -25,9 +25,11 @@ func (trs *testRevisionSource) advance(d time.Duration) { trs.now = v1.NewTime(trs.now.Add(d)) } -func (trs *testRevisionSource) assertResult(d time.Duration) { +func (trs *testRevisionSource) assertResult(d time.Duration, flags vmv1.Flag) { require.NotNil(trs.t, trs.result) assert.Equal(trs.t, d, *trs.result) + require.NotNil(trs.t, trs.resultFlags) + assert.Equal(trs.t, flags, *trs.resultFlags) trs.result = nil } @@ -53,53 +55,53 @@ func TestRevSource(t *testing.T) { trs := newTestRevisionSource(t) // Generate new revision - rev := trs.Next(trs.now.Time) - assert.Equal(t, int64(0), rev.Value) + rev := trs.Next(trs.now.Time, revsource.Upscale) + assert.Equal(t, int64(1), rev.Value) // Observe it coming back in 5 seconds trs.advance(5 * time.Second) err := trs.Observe(trs.now.Time, rev) assert.NoError(t, err) - trs.assertResult(5 * time.Second) + trs.assertResult(5*time.Second, revsource.Upscale) } func TestRevSourceSkip(t *testing.T) { trs := newTestRevisionSource(t) // Generate new clock - rev1 := trs.Next(trs.now.Time) - assert.Equal(t, int64(0), rev1.Value) + rev1 := trs.Next(trs.now.Time, 0) + assert.Equal(t, int64(1), rev1.Value) // Generate another one trs.advance(5 * time.Second) - rev2 := trs.Next(trs.now.Time) - assert.Equal(t, int64(1), rev2.Value) + rev2 := trs.Next(trs.now.Time, 0) + assert.Equal(t, int64(2), rev2.Value) // Observe the first one trs.advance(5 * time.Second) err := trs.Observe(trs.now.Time, rev1) assert.NoError(t, err) - trs.assertResult(10 * time.Second) + trs.assertResult(10*time.Second, 0) // Observe the second one trs.advance(2 * time.Second) err = trs.Observe(trs.now.Time, rev2) assert.NoError(t, err) - trs.assertResult(7 * time.Second) + trs.assertResult(7*time.Second, 0) } func TestStale(t *testing.T) { trs := newTestRevisionSource(t) // Generate new clock - cl := trs.Next(trs.now.Time) - assert.Equal(t, int64(0), cl.Value) + cl := trs.Next(trs.now.Time, 0) + assert.Equal(t, int64(1), cl.Value) // Observe it coming back in 5 seconds trs.advance(5 * time.Second) err := trs.Observe(trs.now.Time, cl) assert.NoError(t, err) - trs.assertResult(5 * time.Second) + trs.assertResult(5*time.Second, 0) // Observe it coming back again trs.advance(5 * time.Second) diff 
--git a/pkg/agent/core/state_test.go b/pkg/agent/core/state_test.go index b7d8b14cc..b86ee72ce 100644 --- a/pkg/agent/core/state_test.go +++ b/pkg/agent/core/state_test.go @@ -235,7 +235,7 @@ func duration(s string) time.Duration { return d } -func zeroRev(flag vmv1.Flag, t time.Time) vmv1.RevisionWithTime { +func zeroRev(_ vmv1.Flag, t time.Time) vmv1.RevisionWithTime { return vmv1.Revision{ Value: 0, Flags: 0, From 7b6b08d8d482946534a11485069a8d43f5cf743a Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Sun, 14 Jul 2024 22:55:33 +0400 Subject: [PATCH 28/57] more changes of wording Signed-off-by: Oleg Vasilev --- neonvm/apis/neonvm/v1/virtualmachine_types.go | 35 +++++---- .../v1/virtualmachinemigration_types.go | 2 +- .../bases/vm.neon.tech_virtualmachines.yaml | 72 ++++++++++--------- neonvm/controllers/vm_controller.go | 8 +-- pkg/agent/core/revsource/revsource.go | 12 ++-- pkg/agent/core/revsource/revsource_test.go | 4 +- pkg/agent/core/state.go | 4 +- pkg/agent/execbridge.go | 4 +- pkg/agent/executor/exec_neonvm.go | 6 +- pkg/agent/runner.go | 6 +- 10 files changed, 76 insertions(+), 77 deletions(-) diff --git a/neonvm/apis/neonvm/v1/virtualmachine_types.go b/neonvm/apis/neonvm/v1/virtualmachine_types.go index 89c5c9fe0..d65a61ae9 100644 --- a/neonvm/apis/neonvm/v1/virtualmachine_types.go +++ b/neonvm/apis/neonvm/v1/virtualmachine_types.go @@ -146,10 +146,10 @@ type VirtualMachineSpec struct { // propagate to the VM. // // If a certain value is written into Spec.TargetRevision together with the changes, and - // the same value is observed in Status.CurrentRevision, it means that the changes have + // the same value is observed in Status.CurrentRevision, it means that the changes were // propagated to the VM. // +optional - TargetRevision *RevisionWithTime `json:"desiredLogicalTime,omitempty"` + TargetRevision *RevisionWithTime `json:"targetRevision,omitempty"` } func (spec *VirtualMachineSpec) Resources() VirtualMachineResources { @@ -226,6 +226,9 @@ func (g Guest) ValidateForMemoryProvider(p MemoryProvider) error { return nil } +// Flag is a bitmask of flags. The meaning is up to the user. +// +// Used in Revision below. type Flag uint64 func (f *Flag) Set(flag Flag) { @@ -236,17 +239,18 @@ func (f *Flag) Clear(flag Flag) { *f &= ^flag } -func (f Flag) Has(flag Flag) bool { - return f&flag != 0 +func (f *Flag) Has(flag Flag) bool { + return *f&flag != 0 } -// Revision allows to assign an identifier to a configuration of a VM. +// Revision is an identifier, which can be assigned to a specific configuration of a VM. // Later it can be used to track the application of the configuration. type Revision struct { Value int64 `json:"value"` Flags Flag `json:"flags"` } +// ZeroRevision is the default value when revisions updates are disabled. 
var ZeroRevision = Revision{Value: 0, Flags: 0} func (r Revision) Min(other Revision) Revision { @@ -263,7 +267,7 @@ func (r Revision) WithTime(t time.Time) RevisionWithTime { } } -// MarshalLogObject implements zapcore.ObjectMarshaler, so that LogicalTime can be used with zap.Object +// MarshalLogObject implements zapcore.ObjectMarshaler, so that Revision can be used with zap.Object func (r *Revision) MarshalLogObject(enc zapcore.ObjectEncoder) error { enc.AddInt64("value", r.Value) enc.AddUint64("flags", uint64(r.Flags)) @@ -276,20 +280,10 @@ type RevisionWithTime struct { UpdatedAt metav1.Time `json:"updatedAt"` } -// MarshalLogObject implements zapcore.ObjectMarshaler, so that LogicalTime can be used with zap.Object +// MarshalLogObject implements zapcore.ObjectMarshaler, so that RevisionWithTime can be used with zap.Object func (r *RevisionWithTime) MarshalLogObject(enc zapcore.ObjectEncoder) error { - enc.AddInt64("rev", r.Revision.Value) enc.AddTime("updatedAt", r.UpdatedAt.Time) - return nil -} - -func (r *RevisionWithTime) Update(now time.Time, rev Revision) { - r.Revision = rev - r.UpdatedAt = metav1.NewTime(now) -} - -func (t *RevisionWithTime) UpdateNow(rev Revision) { - t.Update(time.Now(), rev) + return r.Revision.MarshalLogObject(enc) } type GuestSettings struct { @@ -579,7 +573,7 @@ type VirtualMachineStatus struct { // Represents the observations of a VirtualMachine's current state. // VirtualMachine.status.conditions.type are: "Available", "Progressing", and "Degraded" // VirtualMachine.status.conditions.status are one of True, False, Unknown. - // VirtualMachine.status.conditions.reason the Value should be a CamelCase string and producers of specific + // VirtualMachine.status.conditions.reason the value should be a CamelCase string and producers of specific // condition types may define expected values and meanings for this field, and whether the values // are considered a guaranteed API. // VirtualMachine.status.conditions.Message is a human readable message indicating details about the transition. @@ -611,6 +605,9 @@ type VirtualMachineStatus struct { MemoryProvider *MemoryProvider `json:"memoryProvider,omitempty"` // +optional SSHSecretName string `json:"sshSecretName,omitempty"` + + // CurrentRevision is updated with Spec.TargetRevision's value once + // the changes are propagated to the VM. // +optional CurrentRevision *RevisionWithTime `json:"currentRevision,omitempty"` } diff --git a/neonvm/apis/neonvm/v1/virtualmachinemigration_types.go b/neonvm/apis/neonvm/v1/virtualmachinemigration_types.go index d909e7fcf..3de246363 100644 --- a/neonvm/apis/neonvm/v1/virtualmachinemigration_types.go +++ b/neonvm/apis/neonvm/v1/virtualmachinemigration_types.go @@ -73,7 +73,7 @@ type VirtualMachineMigrationStatus struct { // Represents the observations of a VirtualMachineMigration's current state. // VirtualMachineMigration.status.conditions.type are: "Available", "Progressing", and "Degraded" // VirtualMachineMigration.status.conditions.status are one of True, False, Unknown. - // VirtualMachineMigration.status.conditions.reason the Value should be a CamelCase string and producers of specific + // VirtualMachineMigration.status.conditions.reason the value should be a CamelCase string and producers of specific // condition types may define expected values and meanings for this field, and whether the values // are considered a guaranteed API. // VirtualMachineMigration.status.conditions.Message is a human readable message indicating details about the transition. 
diff --git a/neonvm/config/crd/bases/vm.neon.tech_virtualmachines.yaml b/neonvm/config/crd/bases/vm.neon.tech_virtualmachines.yaml index cb3d04d3b..805da9c0f 100644 --- a/neonvm/config/crd/bases/vm.neon.tech_virtualmachines.yaml +++ b/neonvm/config/crd/bases/vm.neon.tech_virtualmachines.yaml @@ -892,35 +892,6 @@ spec: type: array type: object type: object - desiredLogicalTime: - description: "TargetRevision is the identifier set by external party - to track when changes to the spec propagate to the VM. \n If a certain - value is written into Spec.TargetRevision together with the changes, - and the same value is observed in Status.CurrentRevision, it means - that the changes have propagated to the VM." - properties: - revision: - description: Revision allows to assign an identifier to a configuration - of a VM. Later it can be used to track the application of the - configuration. - properties: - flags: - format: int64 - type: integer - value: - format: int64 - type: integer - required: - - flags - - value - type: object - updatedAt: - format: date-time - type: string - required: - - revision - - updatedAt - type: object disks: description: List of disk that can be mounted by virtual machine. items: @@ -2691,6 +2662,37 @@ spec: type: boolean serviceAccountName: type: string + targetRevision: + description: "TargetRevision is the identifier set by external party + to track when changes to the spec propagate to the VM. \n If a certain + value is written into Spec.TargetRevision together with the changes, + and the same value is observed in Status.CurrentRevision, it means + that the changes were propagated to the VM." + properties: + revision: + description: Revision is an identifier, which can be assigned + to a specific configuration of a VM. Later it can be used to + track the application of the configuration. + properties: + flags: + description: "Flag is a bitmask of flags. The meaning is up + to the user. \n Used in Revision below." + format: int64 + type: integer + value: + format: int64 + type: integer + required: + - flags + - value + type: object + updatedAt: + format: date-time + type: string + required: + - revision + - updatedAt + type: object terminationGracePeriodSeconds: default: 5 format: int64 @@ -2816,15 +2818,17 @@ spec: type: integer x-kubernetes-int-or-string: true currentRevision: - description: RevisionWithTime contains a Revision and the time it - was last updated. + description: CurrentRevision is updated with Spec.TargetRevision's + value once the changes are propagated to the VM. properties: revision: - description: Revision allows to assign an identifier to a configuration - of a VM. Later it can be used to track the application of the - configuration. + description: Revision is an identifier, which can be assigned + to a specific configuration of a VM. Later it can be used to + track the application of the configuration. properties: flags: + description: "Flag is a bitmask of flags. The meaning is up + to the user. \n Used in Revision below." 
format: int64 type: integer value: diff --git a/neonvm/controllers/vm_controller.go b/neonvm/controllers/vm_controller.go index 7a4962714..5491e48a2 100644 --- a/neonvm/controllers/vm_controller.go +++ b/neonvm/controllers/vm_controller.go @@ -800,11 +800,9 @@ func (r *VMReconciler) doReconcile(ctx context.Context, vm *vmv1.VirtualMachine) // do nothing } - if vm.Status.Phase == vmv1.VmRunning { - if vm.Spec.TargetRevision != nil { - rev := vm.Spec.TargetRevision.WithTime(time.Now()) - vm.Status.CurrentRevision = &rev - } + if vm.Status.Phase == vmv1.VmRunning && vm.Spec.TargetRevision != nil { + rev := vm.Spec.TargetRevision.WithTime(time.Now()) + vm.Status.CurrentRevision = &rev } return nil diff --git a/pkg/agent/core/revsource/revsource.go b/pkg/agent/core/revsource/revsource.go index 38306df77..0702dabd7 100644 --- a/pkg/agent/core/revsource/revsource.go +++ b/pkg/agent/core/revsource/revsource.go @@ -30,15 +30,15 @@ func FlagsToLabels(flags vmv1.Flag) []string { return ret } -// RevisionSource can generate and observe logical time. -// Each logical timestamp is associated with a physical timestamp and a set of flags upon creation. -// Once RevisionSource observes a previously generated timestamp after some time, it will call the callback with -// the time difference and the flags associated with the timestamp. +// RevisionSource can generate and observe revisions. +// Each Revision is a value and a set of flags (for meta-information). +// Once RevisionSource observes a previously generated Revision after some time, +// the time it took since that Revision was generated. type RevisionSource struct { cb func(time.Duration, vmv1.Flag) - // The in-flight timestamps are stored in-order. - // After the timestamp is observed, it is removed from the measurements, and the offset is increased. + // The in-flight revisions are stored in-order. + // After the revision is observed, it is removed from the measurements, and the offset is increased. measurements []time.Time offset int64 } diff --git a/pkg/agent/core/revsource/revsource_test.go b/pkg/agent/core/revsource/revsource_test.go index 1b5d185df..2a663c61f 100644 --- a/pkg/agent/core/revsource/revsource_test.go +++ b/pkg/agent/core/revsource/revsource_test.go @@ -68,7 +68,7 @@ func TestRevSource(t *testing.T) { func TestRevSourceSkip(t *testing.T) { trs := newTestRevisionSource(t) - // Generate new clock + // Generate new revision rev1 := trs.Next(trs.now.Time, 0) assert.Equal(t, int64(1), rev1.Value) @@ -93,7 +93,7 @@ func TestRevSourceSkip(t *testing.T) { func TestStale(t *testing.T) { trs := newTestRevisionSource(t) - // Generate new clock + // Generate new revision cl := trs.Next(trs.now.Time, 0) assert.Equal(t, int64(1), cl.Value) diff --git a/pkg/agent/core/state.go b/pkg/agent/core/state.go index e24a9d2f8..9a3308130 100644 --- a/pkg/agent/core/state.go +++ b/pkg/agent/core/state.go @@ -75,7 +75,7 @@ type Config struct { // about conditions that are impeding its ability to execute. Log LogConfig `json:"-"` - // RevisionSource is the source of logical timestamps for the autoscaler-agent. + // RevisionSource is the source of revisions to track the progress during scaling. 
RevisionSource RevisionSource `json:"-"` } @@ -864,7 +864,7 @@ func (s *state) updateTargetRevision( } } else { if *s.LastDesiredResources == desired { - // Nothing changed, so no need to update the logical time + // Nothing changed, so no need to update the target revision return } } diff --git a/pkg/agent/execbridge.go b/pkg/agent/execbridge.go index 60a0136cd..78b9b79e9 100644 --- a/pkg/agent/execbridge.go +++ b/pkg/agent/execbridge.go @@ -91,11 +91,11 @@ func (iface *execNeonVMInterface) Request( ctx context.Context, logger *zap.Logger, current, target api.Resources, - currentRevision vmv1.RevisionWithTime, + targetRevision vmv1.RevisionWithTime, ) error { iface.runner.recordResourceChange(current, target, iface.runner.global.metrics.neonvmRequestedChange) - err := iface.runner.doNeonVMRequest(ctx, target, currentRevision) + err := iface.runner.doNeonVMRequest(ctx, target, targetRevision) if err != nil { iface.runner.status.update(iface.runner.global, func(ps podStatus) podStatus { ps.failedNeonVMRequestCounter.Inc() diff --git a/pkg/agent/executor/exec_neonvm.go b/pkg/agent/executor/exec_neonvm.go index 358f6a4c8..fcd6a8811 100644 --- a/pkg/agent/executor/exec_neonvm.go +++ b/pkg/agent/executor/exec_neonvm.go @@ -17,7 +17,7 @@ type NeonVMInterface interface { _ context.Context, _ *zap.Logger, current, target api.Resources, - currentRevision vmv1.RevisionWithTime, + targetRevision vmv1.RevisionWithTime, ) error } @@ -53,8 +53,8 @@ func (c *ExecutorCoreWithClients) DoNeonVMRequests(ctx context.Context, logger * } endTime := time.Now() - currentRevision := action.TargetRevision.WithTime(endTime) - err := c.clients.NeonVM.Request(ctx, ifaceLogger, action.Current, action.Target, currentRevision) + targetRevision := action.TargetRevision.WithTime(endTime) + err := c.clients.NeonVM.Request(ctx, ifaceLogger, action.Current, action.Target, targetRevision) logFields := []zap.Field{zap.Object("action", action), zap.Duration("duration", endTime.Sub(startTime))} diff --git a/pkg/agent/runner.go b/pkg/agent/runner.go index 3ec92bfe0..2ff7e696d 100644 --- a/pkg/agent/runner.go +++ b/pkg/agent/runner.go @@ -636,7 +636,7 @@ func doMetricsRequest( func (r *Runner) doNeonVMRequest( ctx context.Context, target api.Resources, - currentRevision vmv1.RevisionWithTime, + targetRevision vmv1.RevisionWithTime, ) error { patches := []patch.Operation{{ Op: patch.OpReplace, @@ -648,8 +648,8 @@ func (r *Runner) doNeonVMRequest( Value: uint32(target.Mem / r.memSlotSize), }, { Op: patch.OpReplace, - Path: "/spec/desiredLogicalTime", - Value: currentRevision, + Path: "/spec/targetRevision", + Value: targetRevision, }} patchPayload, err := json.Marshal(patches) From af5b68316263419d2f276d5cbb71f681dc21f05d Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Sun, 14 Jul 2024 23:10:03 +0400 Subject: [PATCH 29/57] codestyle fixes Signed-off-by: Oleg Vasilev --- pkg/agent/core/state_test.go | 43 ++++++++++++++++------------ pkg/agent/core/testhelpers/assert.go | 13 --------- pkg/agent/prommetrics.go | 10 +++++-- 3 files changed, 32 insertions(+), 34 deletions(-) diff --git a/pkg/agent/core/state_test.go b/pkg/agent/core/state_test.go index b86ee72ce..1a53adf01 100644 --- a/pkg/agent/core/state_test.go +++ b/pkg/agent/core/state_test.go @@ -85,6 +85,7 @@ func Test_DesiredResourcesFromMetricsOrRequestedUpscaling(t *testing.T) { for _, c := range cases { warnings := []string{} + state := core.NewState( api.VmInfo{ Name: "test", @@ -208,19 +209,26 @@ func getDesiredResources(state *core.State, now time.Time) api.Resources { 
return res } -func doInitialPluginRequest(a helpers.Assert, state *core.State, clock *helpers.FakeClock, requestTime time.Duration, metrics *api.Metrics, resources api.Resources) { - rev := vmv1.ZeroRevision +func doInitialPluginRequest( + a helpers.Assert, + state *core.State, + clock *helpers.FakeClock, + requestTime time.Duration, + metrics *api.Metrics, + resources api.Resources, +) { + rev := vmv1.ZeroRevision.WithTime(clock.Now()) a.Call(state.NextActions, clock.Now()).Equals(core.ActionSet{ PluginRequest: &core.ActionPluginRequest{ LastPermit: nil, Target: resources, Metrics: metrics, - TargetRevision: rev.WithTime(clock.Now()), + TargetRevision: rev, }, }) a.Do(state.Plugin().StartingRequest, clock.Now(), resources) clock.Inc(requestTime) - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), rev.WithTime(clock.Now()), api.PluginResponse{ + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), rev, api.PluginResponse{ Permit: resources, Migrate: nil, }) @@ -235,11 +243,8 @@ func duration(s string) time.Duration { return d } -func zeroRev(_ vmv1.Flag, t time.Time) vmv1.RevisionWithTime { - return vmv1.Revision{ - Value: 0, - Flags: 0, - }.WithTime(t) +func zeroRev(t time.Time) vmv1.RevisionWithTime { + return vmv1.ZeroRevision.WithTime(t) } // Thorough checks of a relatively simple flow - scaling from 1 CU to 2 CU and back down. @@ -599,12 +604,12 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { MonitorDownscale: &core.ActionMonitorDownscale{ Current: resForCU(6), Target: resForCU(5), - TargetRevision: zeroRev(revsource.Downscale, clock.Now()), + TargetRevision: zeroRev(clock.Now()), }, }) a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(5)) clockTick() - a.Do(state.Monitor().DownscaleRequestDenied, clock.Now(), zeroRev(revsource.Downscale, clock.Now())) + a.Do(state.Monitor().DownscaleRequestDenied, clock.Now(), zeroRev(clock.Now())) // At the end, we should be waiting to retry downscaling: a.Call(nextActions).Equals(core.ActionSet{ @@ -622,7 +627,7 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { expectedNeonVMRequest = &core.ActionNeonVMRequest{ Current: resForCU(6), Target: resForCU(cu + 1), - TargetRevision: zeroRev(revsource.Downscale, clock.Now()), + TargetRevision: zeroRev(clock.Now()), } } @@ -631,7 +636,7 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { MonitorDownscale: &core.ActionMonitorDownscale{ Current: resForCU(cu + 1), Target: resForCU(cu), - TargetRevision: zeroRev(revsource.Downscale, clock.Now()), + TargetRevision: zeroRev(clock.Now()), }, NeonVMRequest: expectedNeonVMRequest, }) @@ -643,9 +648,9 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { clockTick() currentPluginWait -= clockTickDuration if cu >= 3 /* allow down to 3 */ { - a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now(), zeroRev(revsource.Downscale, clock.Now())) + a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now(), zeroRev(clock.Now())) } else { - a.Do(state.Monitor().DownscaleRequestDenied, clock.Now(), zeroRev(revsource.Downscale, clock.Now())) + a.Do(state.Monitor().DownscaleRequestDenied, clock.Now(), zeroRev(clock.Now())) } } // At this point, waiting 3.7s for next attempt to downscale below 3 CU (last request was @@ -656,7 +661,7 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { NeonVMRequest: &core.ActionNeonVMRequest{ Current: resForCU(6), Target: resForCU(3), - TargetRevision: zeroRev(revsource.Downscale, clock.Now()), + TargetRevision: zeroRev(clock.Now()), }, }) // Make the request: @@ 
-674,7 +679,7 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { LastPermit: lo.ToPtr(resForCU(6)), Target: resForCU(3), Metrics: lo.ToPtr(metrics.ToAPI()), - TargetRevision: zeroRev(revsource.Downscale, clock.Now()), + TargetRevision: zeroRev(clock.Now()), }, }) a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(3)) @@ -855,7 +860,7 @@ func TestRequestedUpscale(t *testing.T) { LastPermit: lo.ToPtr(resForCU(1)), Target: resForCU(2), Metrics: lo.ToPtr(lastMetrics.ToAPI()), - TargetRevision: zeroRev(revsource.Upscale|revsource.Immediate, clock.Now()), + TargetRevision: zeroRev(clock.Now()), }, }) a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(2)) @@ -887,7 +892,7 @@ func TestRequestedUpscale(t *testing.T) { MonitorUpscale: &core.ActionMonitorUpscale{ Current: resForCU(1), Target: resForCU(2), - TargetRevision: zeroRev(revsource.Upscale|revsource.Immediate, clock.Now()), + TargetRevision: zeroRev(clock.Now()), }, }) a.Do(state.Monitor().StartingUpscaleRequest, clock.Now(), resForCU(2)) diff --git a/pkg/agent/core/testhelpers/assert.go b/pkg/agent/core/testhelpers/assert.go index 23d460eef..19ecdb228 100644 --- a/pkg/agent/core/testhelpers/assert.go +++ b/pkg/agent/core/testhelpers/assert.go @@ -60,19 +60,6 @@ func (a Assert) NoError(f any, args ...any) { a.Call(f, args...).Equals(nil) } -// SafeVal creates a safe value that can be used in Assert.Call() call. -// -// We have to use this function because calling the Assert.Call() method with a -// nil parameter can cause a panic like: -// panic: reflect: Call using zero Value argument... -func SafeVal[T any](i any) (v reflect.Value) { - v = reflect.ValueOf(i) - if i == nil { - v = reflect.Zero(reflect.TypeOf((*T)(nil))) - } - return -} - // Call sets up a prepared function call, which will not be executed until one of its methods is // actually called, which will perform all the relevant checks. // diff --git a/pkg/agent/prommetrics.go b/pkg/agent/prommetrics.go index ebead8c46..1afbaa837 100644 --- a/pkg/agent/prommetrics.go +++ b/pkg/agent/prommetrics.go @@ -51,6 +51,11 @@ const ( runnerMetricStatePanicked runnerMetricState = "panicked" ) +// Copied bucket values from controller runtime latency metric. We can +// adjust them in the future if needed. 
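+// The values are in seconds, ranging from 5ms to 60s.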
+var buckets = []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, + 1.25, 1.5, 1.75, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 40, 50, 60} + func makeGlobalMetrics() (GlobalMetrics, *prometheus.Registry) { reg := prometheus.NewRegistry() @@ -223,8 +228,9 @@ func makeGlobalMetrics() (GlobalMetrics, *prometheus.Registry) { scalingLatency: *util.RegisterMetric(reg, prometheus.NewHistogramVec( prometheus.HistogramOpts{ - Name: "autoscaling_agent_scaling_latency_seconds", - Help: "End-to-end scaling latency", + Name: "autoscaling_agent_scaling_latency_seconds", + Help: "End-to-end scaling latency", + Buckets: buckets, }, revsource.AllFlagNames, )), } From fc436aa70cbce3553b01dd015c74adf7a67d2998 Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Mon, 15 Jul 2024 12:49:38 +0400 Subject: [PATCH 30/57] add aux metrics Signed-off-by: Oleg Vasilev --- pkg/agent/core/dumpstate.go | 2 + pkg/agent/core/revsource/revsource.go | 36 ++++++++++++++- pkg/agent/core/state.go | 50 +++++++++++++++++---- pkg/agent/core/state_test.go | 64 ++++++++++++++++++--------- pkg/agent/prommetrics.go | 36 +++++++++++++++ pkg/agent/runner.go | 13 +++--- 6 files changed, 165 insertions(+), 36 deletions(-) diff --git a/pkg/agent/core/dumpstate.go b/pkg/agent/core/dumpstate.go index 7b3812135..3e4d9353d 100644 --- a/pkg/agent/core/dumpstate.go +++ b/pkg/agent/core/dumpstate.go @@ -73,5 +73,7 @@ func (s *neonvmState) deepCopy() neonvmState { LastSuccess: shallowCopy[api.Resources](s.LastSuccess), OngoingRequested: shallowCopy[api.Resources](s.OngoingRequested), RequestFailedAt: shallowCopy[time.Time](s.RequestFailedAt), + TargetRevision: s.TargetRevision, + CurrentRevision: s.CurrentRevision, } } diff --git a/pkg/agent/core/revsource/revsource.go b/pkg/agent/core/revsource/revsource.go index 0702dabd7..fe7ff4107 100644 --- a/pkg/agent/core/revsource/revsource.go +++ b/pkg/agent/core/revsource/revsource.go @@ -4,6 +4,8 @@ import ( "errors" "time" + "github.com/prometheus/client_golang/prometheus" + vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1" ) @@ -35,7 +37,7 @@ func FlagsToLabels(flags vmv1.Flag) []string { // Once RevisionSource observes a previously generated Revision after some time, // the time it took since that Revision was generated. type RevisionSource struct { - cb func(time.Duration, vmv1.Flag) + cb MetricCB // The in-flight revisions are stored in-order. // After the revision is observed, it is removed from the measurements, and the offset is increased. @@ -43,7 +45,7 @@ type RevisionSource struct { offset int64 } -func NewRevisionSource(cb func(time.Duration, vmv1.Flag)) *RevisionSource { +func NewRevisionSource(cb MetricCB) *RevisionSource { return &RevisionSource{ cb: cb, measurements: nil, @@ -97,3 +99,33 @@ func (c *NilRevisionSource) Next(_ time.Time, _ vmv1.Flag) vmv1.Revision { } } func (c *NilRevisionSource) Observe(_ time.Time, _ vmv1.Revision) error { return nil } + +type MetricCB func(dur time.Duration, flags vmv1.Flag) + +func WrapHistogramVec(hist *prometheus.HistogramVec) MetricCB { + return func(dur time.Duration, flags vmv1.Flag) { + labels := FlagsToLabels(flags) + hist.WithLabelValues(labels...).Observe(dur.Seconds()) + } +} + +// Propagate sets the target revision to be current, optionally measuring the time it took +// for propagation. 
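+//
+// For example (cf. the call sites in state.go), propagating a NeonVM revision looks like:
+//
+//	revsource.Propagate(currentRevision.UpdatedAt.Time,
+//		s.NeonVM.TargetRevision,
+//		&s.NeonVM.CurrentRevision,
+//		s.Config.PromMetricsCallbacks.NeonVMLatency)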
+func Propagate( + now time.Time, + target vmv1.RevisionWithTime, + currentSlot *vmv1.Revision, + metricCB MetricCB, +) { + if metricCB != nil { + diff := now.Sub(target.UpdatedAt.Time) + metricCB(diff, target.Flags) + } + if currentSlot == nil { + return + } + if currentSlot.Value > target.Value { + return + } + *currentSlot = target.Revision +} diff --git a/pkg/agent/core/state.go b/pkg/agent/core/state.go index 9a3308130..49feef993 100644 --- a/pkg/agent/core/state.go +++ b/pkg/agent/core/state.go @@ -36,6 +36,12 @@ import ( "github.com/neondatabase/autoscaling/pkg/util" ) +type PromMetricsCallbacks struct { + PluginLatency revsource.MetricCB + MonitorLatency revsource.MetricCB + NeonVMLatency revsource.MetricCB +} + // Config represents some of the static configuration underlying the decision-making of State type Config struct { // ComputeUnit is the desired ratio between CPU and memory, copied from the global @@ -77,6 +83,9 @@ type Config struct { // RevisionSource is the source of revisions to track the progress during scaling. RevisionSource RevisionSource `json:"-"` + + // PromMetricsCallbacks are the callbacks to update the Prometheus metrics. + PromMetricsCallbacks PromMetricsCallbacks `json:"-"` } type LogConfig struct { @@ -207,6 +216,9 @@ type neonvmState struct { // OngoingRequested, if not nil, gives the resources requested OngoingRequested *api.Resources RequestFailedAt *time.Time + + TargetRevision vmv1.RevisionWithTime + CurrentRevision vmv1.Revision } func (ns *neonvmState) ongoingRequest() bool { @@ -244,6 +256,8 @@ func NewState(vm api.VmInfo, config Config) *State { LastSuccess: nil, OngoingRequested: nil, RequestFailedAt: nil, + TargetRevision: vmv1.ZeroRevision.WithTime(time.Time{}), + CurrentRevision: vmv1.ZeroRevision, }, Metrics: nil, LastDesiredResources: nil, @@ -501,10 +515,11 @@ func (s *state) calculateNeonVMAction( } } + s.NeonVM.TargetRevision = targetRevision.WithTime(now) return &ActionNeonVMRequest{ Current: s.VM.Using(), Target: desiredResources, - TargetRevision: targetRevision.WithTime(now), + TargetRevision: s.NeonVM.TargetRevision, }, nil } else { var reqs []string @@ -885,8 +900,13 @@ func (s *state) updateTargetRevision( } -func (s *state) updateCurrentRevision(rev vmv1.RevisionWithTime) { - err := s.Config.RevisionSource.Observe(rev.UpdatedAt.Time, rev.Revision) +func (s *state) updateNeonVMCurrentRevision(currentRevision vmv1.RevisionWithTime) { + revsource.Propagate(currentRevision.UpdatedAt.Time, + s.NeonVM.TargetRevision, + &s.NeonVM.CurrentRevision, + s.Config.PromMetricsCallbacks.NeonVMLatency, + ) + err := s.Config.RevisionSource.Observe(currentRevision.UpdatedAt.Time, currentRevision.Revision) if err != nil { s.warnf("Failed to observe clock source: %v", err) } @@ -1028,7 +1048,7 @@ func (s *State) UpdatedVM(vm api.VmInfo) { vm.SetUsing(s.internal.VM.Using()) s.internal.VM = vm if vm.CurrentRevision != nil { - s.internal.updateCurrentRevision(*vm.CurrentRevision) + s.internal.updateNeonVMCurrentRevision(*vm.CurrentRevision) } } @@ -1064,7 +1084,7 @@ func (h PluginHandle) RequestFailed(now time.Time) { func (h PluginHandle) RequestSuccessful( now time.Time, - rev vmv1.RevisionWithTime, + targetRevision vmv1.RevisionWithTime, resp api.PluginResponse, ) (_err error) { h.s.Plugin.OngoingRequest = false @@ -1098,7 +1118,11 @@ func (h PluginHandle) RequestSuccessful( // the process of moving the source of truth for ComputeUnit from the scheduler plugin to the // autoscaler-agent. 
h.s.Plugin.Permit = &resp.Permit - h.s.Plugin.CurrentRevision = rev.Revision + revsource.Propagate(now, + targetRevision, + &h.s.Plugin.CurrentRevision, + h.s.Config.PromMetricsCallbacks.PluginLatency, + ) return nil } @@ -1169,18 +1193,26 @@ func (h MonitorHandle) StartingDownscaleRequest(now time.Time, resources api.Res func (h MonitorHandle) DownscaleRequestAllowed(now time.Time, rev vmv1.RevisionWithTime) { h.s.Monitor.Approved = &h.s.Monitor.OngoingRequest.Requested h.s.Monitor.OngoingRequest = nil - h.s.Monitor.CurrentRevision = rev.Revision + revsource.Propagate(now, + rev, + &h.s.Monitor.CurrentRevision, + h.s.Config.PromMetricsCallbacks.MonitorLatency, + ) } // Downscale request was successful but the monitor denied our request. -func (h MonitorHandle) DownscaleRequestDenied(now time.Time, rev vmv1.RevisionWithTime) { +func (h MonitorHandle) DownscaleRequestDenied(now time.Time, targetRevision vmv1.RevisionWithTime) { h.s.Monitor.DeniedDownscale = &deniedDownscale{ At: now, Current: *h.s.Monitor.Approved, Requested: h.s.Monitor.OngoingRequest.Requested, } h.s.Monitor.OngoingRequest = nil - h.s.Monitor.CurrentRevision = rev.Revision + revsource.Propagate(now, + targetRevision, + &h.s.Monitor.CurrentRevision, + h.s.Config.PromMetricsCallbacks.MonitorLatency, + ) } func (h MonitorHandle) DownscaleRequestFailed(now time.Time) { diff --git a/pkg/agent/core/state_test.go b/pkg/agent/core/state_test.go index 1a53adf01..7790d7456 100644 --- a/pkg/agent/core/state_test.go +++ b/pkg/agent/core/state_test.go @@ -132,6 +132,11 @@ func Test_DesiredResourcesFromMetricsOrRequestedUpscaling(t *testing.T) { }, }, RevisionSource: revsource.NewRevisionSource(nil), + PromMetricsCallbacks: core.PromMetricsCallbacks{ + PluginLatency: nil, + MonitorLatency: nil, + NeonVMLatency: nil, + }, }, ) @@ -201,6 +206,11 @@ var DefaultInitialStateConfig = helpers.InitialStateConfig{ Warn: nil, }, RevisionSource: &revsource.NilRevisionSource{}, + PromMetricsCallbacks: core.PromMetricsCallbacks{ + PluginLatency: nil, + MonitorLatency: nil, + NeonVMLatency: nil, + }, }, } @@ -247,6 +257,33 @@ func zeroRev(t time.Time) vmv1.RevisionWithTime { return vmv1.ZeroRevision.WithTime(t) } +type latencyObserver struct { + t *testing.T + observations []struct { + latency time.Duration + flags vmv1.Flag + } +} + +func (a *latencyObserver) observe(latency time.Duration, flags vmv1.Flag) { + a.observations = append(a.observations, struct { + latency time.Duration + flags vmv1.Flag + }{latency, flags}) +} + +func (a *latencyObserver) assert(latency time.Duration, flags vmv1.Flag) { + require.NotEmpty(a.t, a.observations) + assert.Equal(a.t, a.observations[0].latency, latency) + assert.Equal(a.t, a.observations[0].flags, flags) + a.observations = a.observations[1:] +} + +// assertEmpty should be called in defer +func (a *latencyObserver) assertEmpty() { + assert.Empty(a.t, a.observations) +} + // Thorough checks of a relatively simple flow - scaling from 1 CU to 2 CU and back down. 
func TestBasicScaleUpAndDownFlow(t *testing.T) { a := helpers.NewAssert(t) @@ -256,20 +293,13 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { } resForCU := DefaultComputeUnit.Mul - var latencyObservations []struct { - latency time.Duration - flags vmv1.Flag - } + latencyObserver := &latencyObserver{t: t, observations: nil} + defer latencyObserver.assertEmpty() expectedRevision := vmv1.ZeroRevision cfg := DefaultInitialStateConfig - cfg.Core.RevisionSource = revsource.NewRevisionSource(func(latency time.Duration, flags vmv1.Flag) { - latencyObservations = append(latencyObservations, struct { - latency time.Duration - flags vmv1.Flag - }{latency, flags}) - }) + cfg.Core.RevisionSource = revsource.NewRevisionSource(latencyObserver.observe) state := helpers.CreateInitialState( cfg, @@ -341,7 +371,7 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { }) // Until NeonVM is successful, we won't see any observations. - assert.Empty(t, latencyObservations) + latencyObserver.assertEmpty() // Now NeonVM request is done. a.Do(state.NeonVM().RequestSuccessful, clock.Now()) @@ -352,11 +382,8 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { vmInfo.CurrentRevision = &rev a.Do(state.UpdatedVM, vmInfo) - // And we see the latency - require.Len(t, latencyObservations, 1) - // We started at 0.2s and finished at 0.4s - assert.Equal(t, duration("0.2s"), latencyObservations[0].latency) - assert.Equal(t, revsource.Upscale, latencyObservations[0].flags) + // And we see the latency. We started at 0.2s and finished at 0.4s + latencyObserver.assert(duration("0.2s"), revsource.Upscale) // NeonVM change is done, now we should finish by notifying the vm-monitor a.Call(nextActions).Equals(core.ActionSet{ @@ -444,11 +471,8 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { vmInfo.CurrentRevision = &rev a.Do(state.UpdatedVM, vmInfo) - // One more latency observation - require.Len(t, latencyObservations, 2) // We started at 0.6s and finished at 0.9s - assert.Equal(t, duration("0.3s"), latencyObservations[1].latency) - assert.Equal(t, revsource.Downscale, latencyObservations[1].flags) + latencyObserver.assert(duration("0.3s"), revsource.Downscale) // Request to NeonVM completed, it's time to inform the scheduler plugin: a.Call(nextActions).Equals(core.ActionSet{ diff --git a/pkg/agent/prommetrics.go b/pkg/agent/prommetrics.go index 1afbaa837..f4bd5c998 100644 --- a/pkg/agent/prommetrics.go +++ b/pkg/agent/prommetrics.go @@ -29,6 +29,21 @@ type GlobalMetrics struct { runnerNextActions prometheus.Counter scalingLatency prometheus.HistogramVec + pluginLatency prometheus.HistogramVec + monitorLatency prometheus.HistogramVec + neonvmLatency prometheus.HistogramVec +} + +func (m *GlobalMetrics) PluginLatency() *prometheus.HistogramVec { + return &m.pluginLatency +} + +func (m *GlobalMetrics) MonitorLatency() *prometheus.HistogramVec { + return &m.monitorLatency +} + +func (m *GlobalMetrics) NeonVMLatency() *prometheus.HistogramVec { + return &m.neonvmLatency } type resourceChangePair struct { @@ -233,6 +248,27 @@ func makeGlobalMetrics() (GlobalMetrics, *prometheus.Registry) { Buckets: buckets, }, revsource.AllFlagNames, )), + pluginLatency: *util.RegisterMetric(reg, prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "autoscaling_agent_plugin_latency_seconds", + Help: "Plugin request latency", + Buckets: buckets, + }, revsource.AllFlagNames, + )), + monitorLatency: *util.RegisterMetric(reg, prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "autoscaling_agent_monitor_latency_seconds", + Help: 
"Monitor request latency", + Buckets: buckets, + }, revsource.AllFlagNames, + )), + neonvmLatency: *util.RegisterMetric(reg, prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "autoscaling_agent_neonvm_latency_seconds", + Help: "NeonVM request latency", + Buckets: buckets, + }, revsource.AllFlagNames, + )), } // Some of of the metrics should have default keys set to zero. Otherwise, these won't be filled diff --git a/pkg/agent/runner.go b/pkg/agent/runner.go index 2ff7e696d..f87c8ce68 100644 --- a/pkg/agent/runner.go +++ b/pkg/agent/runner.go @@ -196,11 +196,9 @@ func (r *Runner) Run(ctx context.Context, logger *zap.Logger, vmInfoUpdated util pluginRequestJitter := util.NewTimeRange(time.Millisecond, 0, 100).Random() coreExecLogger := execLogger.Named("core") - revisionSource := revsource.NewRevisionSource(func(duration time.Duration, flags vmv1.Flag) { - r.global.metrics.scalingLatency. - WithLabelValues(revsource.FlagsToLabels(flags)...). - Observe(duration.Seconds()) - }) + revisionSource := revsource.NewRevisionSource( + revsource.WrapHistogramVec(&r.global.metrics.scalingLatency), + ) executorCore := executor.NewExecutorCore(coreExecLogger, getVmInfo(), executor.Config{ OnNextActions: r.global.metrics.runnerNextActions.Inc, Core: core.Config{ @@ -218,6 +216,11 @@ func (r *Runner) Run(ctx context.Context, logger *zap.Logger, vmInfoUpdated util Warn: coreExecLogger.Warn, }, RevisionSource: revisionSource, + PromMetricsCallbacks: core.PromMetricsCallbacks{ + PluginLatency: revsource.WrapHistogramVec(&r.global.metrics.pluginLatency), + MonitorLatency: revsource.WrapHistogramVec(&r.global.metrics.monitorLatency), + NeonVMLatency: revsource.WrapHistogramVec(&r.global.metrics.neonvmLatency), + }, }, }) From df45965f76b6339cf6bdfc2838b2745f44aff040 Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Mon, 15 Jul 2024 13:10:12 +0400 Subject: [PATCH 31/57] test minor latency Signed-off-by: Oleg Vasilev --- pkg/agent/core/state_test.go | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/pkg/agent/core/state_test.go b/pkg/agent/core/state_test.go index 7790d7456..25cb881ce 100644 --- a/pkg/agent/core/state_test.go +++ b/pkg/agent/core/state_test.go @@ -274,8 +274,8 @@ func (a *latencyObserver) observe(latency time.Duration, flags vmv1.Flag) { func (a *latencyObserver) assert(latency time.Duration, flags vmv1.Flag) { require.NotEmpty(a.t, a.observations) - assert.Equal(a.t, a.observations[0].latency, latency) - assert.Equal(a.t, a.observations[0].flags, flags) + assert.Equal(a.t, latency, a.observations[0].latency) + assert.Equal(a.t, flags, a.observations[0].flags) a.observations = a.observations[1:] } @@ -504,8 +504,14 @@ func TestPeriodicPluginRequest(t *testing.T) { a := helpers.NewAssert(t) clock := helpers.NewFakeClock(t) + cfg := DefaultInitialStateConfig + latencyObserver := &latencyObserver{t: t, observations: nil} + defer latencyObserver.assertEmpty() + cfg.Core.PromMetricsCallbacks.PluginLatency = latencyObserver.observe + + cfg.Core.RevisionSource = revsource.NewRevisionSource(nil) state := helpers.CreateInitialState( - DefaultInitialStateConfig, + cfg, helpers.WithStoredWarnings(a.StoredWarnings()), ) @@ -528,6 +534,7 @@ func TestPeriodicPluginRequest(t *testing.T) { endTime := duration("20s") doInitialPluginRequest(a, state, clock, clockTick, lo.ToPtr(metrics.ToAPI()), resources) + latencyObserver.assert(duration("100ms"), 0) for clock.Elapsed().Duration < endTime { timeSinceScheduledRequest := (clock.Elapsed().Duration - base) % reqEvery 
@@ -539,23 +546,25 @@ func TestPeriodicPluginRequest(t *testing.T) { }) clock.Inc(clockTick) } else { + target := vmv1.ZeroRevision.WithTime(clock.Now()) a.Call(state.NextActions, clock.Now()).Equals(core.ActionSet{ PluginRequest: &core.ActionPluginRequest{ LastPermit: &resources, Target: resources, Metrics: lo.ToPtr(metrics.ToAPI()), - TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + TargetRevision: target, }, }) a.Do(state.Plugin().StartingRequest, clock.Now(), resources) a.Call(state.NextActions, clock.Now()).Equals(core.ActionSet{}) clock.Inc(reqDuration) a.Call(state.NextActions, clock.Now()).Equals(core.ActionSet{}) - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), vmv1.ZeroRevision.WithTime(clock.Now()), api.PluginResponse{ + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), target, api.PluginResponse{ Permit: resources, Migrate: nil, }) clock.Inc(clockTick - reqDuration) + latencyObserver.assert(reqDuration, 0) } } } From 1c802017044cdfd81f01fc9b00ab100a02687cc8 Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Mon, 15 Jul 2024 13:28:33 +0400 Subject: [PATCH 32/57] couple more tests Signed-off-by: Oleg Vasilev --- pkg/agent/core/state_test.go | 53 +++++++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 13 deletions(-) diff --git a/pkg/agent/core/state_test.go b/pkg/agent/core/state_test.go index 25cb881ce..c74f3a8f7 100644 --- a/pkg/agent/core/state_test.go +++ b/pkg/agent/core/state_test.go @@ -293,14 +293,12 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { } resForCU := DefaultComputeUnit.Mul + cfg := DefaultInitialStateConfig latencyObserver := &latencyObserver{t: t, observations: nil} defer latencyObserver.assertEmpty() - - expectedRevision := vmv1.ZeroRevision - cfg := DefaultInitialStateConfig - cfg.Core.RevisionSource = revsource.NewRevisionSource(latencyObserver.observe) + expectedRevision := vmv1.ZeroRevision state := helpers.CreateInitialState( cfg, helpers.WithStoredWarnings(a.StoredWarnings()), @@ -327,7 +325,9 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { Equals(resForCU(2)) // Now that the initial scheduler request is done, and we have metrics that indicate - // scale-up would be a good idea. Revision advances. + // scale-up would be a good idea, we should be contacting the scheduler to get approval. + + // Revision advances. expectedRevision.Value = 1 expectedRevision.Flags = revsource.Upscale @@ -460,18 +460,21 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { }) a.Do(state.NeonVM().RequestSuccessful, clock.Now()) - // Update the VM to set current=1, but first wait 0.1s + // Request to NeonVM is successful, but let's wait one more tick for + // NeonVM to pick up the changes and apply those. clockTick().AssertEquals(duration("0.9s")) vmInfo = helpers.CreateVmInfo( DefaultInitialStateConfig.VM, helpers.WithCurrentCU(1), helpers.WithMinMaxCU(1, 1), ) + + // This means that the NeonVM has applied the changes. rev = expectedRevision.WithTime(clock.Now()) vmInfo.CurrentRevision = &rev a.Do(state.UpdatedVM, vmInfo) - // We started at 0.6s and finished at 0.9s + // We started at 0.6s and finished at 0.9s. 
latencyObserver.assert(duration("0.3s"), revsource.Downscale) // Request to NeonVM completed, it's time to inform the scheduler plugin: @@ -507,6 +510,7 @@ func TestPeriodicPluginRequest(t *testing.T) { cfg := DefaultInitialStateConfig latencyObserver := &latencyObserver{t: t, observations: nil} defer latencyObserver.assertEmpty() + // This time, we will test plugin latency cfg.Core.PromMetricsCallbacks.PluginLatency = latencyObserver.observe cfg.Core.RevisionSource = revsource.NewRevisionSource(nil) @@ -855,8 +859,14 @@ func TestRequestedUpscale(t *testing.T) { } resForCU := DefaultComputeUnit.Mul + cfg := DefaultInitialStateConfig + latencyObserver := &latencyObserver{t: t, observations: nil} + defer latencyObserver.assertEmpty() + cfg.Core.RevisionSource = revsource.NewRevisionSource(latencyObserver.observe) + + expectedRevision := vmv1.ZeroRevision state := helpers.CreateInitialState( - DefaultInitialStateConfig, + cfg, helpers.WithStoredWarnings(a.StoredWarnings()), helpers.WithConfigSetting(func(c *core.Config) { c.MonitorRequestedUpscaleValidPeriod = duration("6s") // Override this for consistency @@ -886,6 +896,10 @@ func TestRequestedUpscale(t *testing.T) { // Have the vm-monitor request upscaling: a.Do(state.Monitor().UpscaleRequested, clock.Now(), api.MoreResources{Cpu: false, Memory: true}) + // Revision advances + expectedRevision.Value = 1 + expectedRevision.Flags = revsource.Immediate | revsource.Upscale + // First need to check with the scheduler plugin to get approval for upscaling: a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: duration("6s")}, // if nothing else happens, requested upscale expires. PluginRequest: &core.ActionPluginRequest{ LastPermit: lo.ToPtr(resForCU(1)), Target: resForCU(2), Metrics: lo.ToPtr(lastMetrics.ToAPI()), - TargetRevision: zeroRev(clock.Now()), + TargetRevision: expectedRevision.WithTime(clock.Now()), }, }) a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(2)) @@ -919,13 +933,23 @@ func TestRequestedUpscale(t *testing.T) { clockTick() a.Do(state.NeonVM().RequestSuccessful, clock.Now()) + // Update the VM to set current=2 + vmInfo := helpers.CreateVmInfo( + DefaultInitialStateConfig.VM, + helpers.WithCurrentCU(2), + ) + rev := expectedRevision.WithTime(clock.Now()) + vmInfo.CurrentRevision = &rev + a.Do(state.UpdatedVM, vmInfo) + latencyObserver.assert(duration("0.2s"), revsource.Upscale|revsource.Immediate) + // Finally, tell the vm-monitor that it got upscaled: a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: duration("4.8s")}, // still waiting on plugin tick MonitorUpscale: &core.ActionMonitorUpscale{ Current: resForCU(1), Target: resForCU(2), - TargetRevision: zeroRev(clock.Now()), + TargetRevision: expectedRevision.WithTime(clock.Now()), }, }) a.Do(state.Monitor().StartingUpscaleRequest, clock.Now(), resForCU(2)) @@ -947,7 +971,7 @@ func TestRequestedUpscale(t *testing.T) { LastPermit: lo.ToPtr(resForCU(2)), Target: resForCU(2), Metrics: lo.ToPtr(lastMetrics.ToAPI()), - TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(clock.Now()), }, }) a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(2)) @@ -965,12 +989,15 @@ func TestRequestedUpscale(t *testing.T) { Wait: &core.ActionWait{Duration: duration("0.9s")}, }) clock.Inc(duration("0.9s")) + // Upscale expired, revision advances + expectedRevision.Value = 2 + expectedRevision.Flags = revsource.Downscale a.Call(nextActions).Equals(core.ActionSet{ Wait:
&core.ActionWait{Duration: duration("4s")}, // now, waiting on plugin request tick MonitorDownscale: &core.ActionMonitorDownscale{ Current: resForCU(2), Target: resForCU(1), - TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(clock.Now()), }, }) } @@ -1352,7 +1379,7 @@ func TestBoundsChangeRequiresUpscale(t *testing.T) { a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(3)) clockTick() a.Do(state.NeonVM().RequestSuccessful, clock.Now()) - // Do vm-monitor upscale request + // Do vm-monitor upscale requestßß a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: duration("4.8s")}, MonitorUpscale: &core.ActionMonitorUpscale{ From 62bb9520f46f480a971f9252cef6fb922cdf717d Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Mon, 15 Jul 2024 18:15:00 +0400 Subject: [PATCH 33/57] fix comment Signed-off-by: Oleg Vasilev --- pkg/agent/core/state_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/agent/core/state_test.go b/pkg/agent/core/state_test.go index c74f3a8f7..f4d1e7294 100644 --- a/pkg/agent/core/state_test.go +++ b/pkg/agent/core/state_test.go @@ -1379,7 +1379,7 @@ func TestBoundsChangeRequiresUpscale(t *testing.T) { a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(3)) clockTick() a.Do(state.NeonVM().RequestSuccessful, clock.Now()) - // Do vm-monitor upscale requestßß + // Do vm-monitor upscale request a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: duration("4.8s")}, MonitorUpscale: &core.ActionMonitorUpscale{ From a6e578142002d49e9ed8f44cc66b10df328fb611 Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Tue, 16 Jul 2024 15:18:58 +0400 Subject: [PATCH 34/57] add revisions e2e test Signed-off-by: Oleg Vasilev --- .../00-assert.yaml | 0 .../00-create-vm.yaml | 0 .../01-assert.yaml | 0 .../01-upscale.yaml | 0 .../02-assert.yaml | 0 .../02-downscale.yaml | 0 .../e2e/autoscaling.revisions/00-assert.yaml | 21 ++++ .../autoscaling.revisions/00-create-vm.yaml | 105 ++++++++++++++++++ .../e2e/autoscaling.revisions/01-assert.yaml | 34 ++++++ .../e2e/autoscaling.revisions/01-upscale.yaml | 49 ++++++++ 10 files changed, 209 insertions(+) rename tests/e2e/{autoscaling => autoscaling.default}/00-assert.yaml (100%) rename tests/e2e/{autoscaling => autoscaling.default}/00-create-vm.yaml (100%) rename tests/e2e/{autoscaling => autoscaling.default}/01-assert.yaml (100%) rename tests/e2e/{autoscaling => autoscaling.default}/01-upscale.yaml (100%) rename tests/e2e/{autoscaling => autoscaling.default}/02-assert.yaml (100%) rename tests/e2e/{autoscaling => autoscaling.default}/02-downscale.yaml (100%) create mode 100644 tests/e2e/autoscaling.revisions/00-assert.yaml create mode 100644 tests/e2e/autoscaling.revisions/00-create-vm.yaml create mode 100644 tests/e2e/autoscaling.revisions/01-assert.yaml create mode 100644 tests/e2e/autoscaling.revisions/01-upscale.yaml diff --git a/tests/e2e/autoscaling/00-assert.yaml b/tests/e2e/autoscaling.default/00-assert.yaml similarity index 100% rename from tests/e2e/autoscaling/00-assert.yaml rename to tests/e2e/autoscaling.default/00-assert.yaml diff --git a/tests/e2e/autoscaling/00-create-vm.yaml b/tests/e2e/autoscaling.default/00-create-vm.yaml similarity index 100% rename from tests/e2e/autoscaling/00-create-vm.yaml rename to tests/e2e/autoscaling.default/00-create-vm.yaml diff --git a/tests/e2e/autoscaling/01-assert.yaml b/tests/e2e/autoscaling.default/01-assert.yaml similarity index 100% rename from 
tests/e2e/autoscaling/01-assert.yaml rename to tests/e2e/autoscaling.default/01-assert.yaml diff --git a/tests/e2e/autoscaling/01-upscale.yaml b/tests/e2e/autoscaling.default/01-upscale.yaml similarity index 100% rename from tests/e2e/autoscaling/01-upscale.yaml rename to tests/e2e/autoscaling.default/01-upscale.yaml diff --git a/tests/e2e/autoscaling/02-assert.yaml b/tests/e2e/autoscaling.default/02-assert.yaml similarity index 100% rename from tests/e2e/autoscaling/02-assert.yaml rename to tests/e2e/autoscaling.default/02-assert.yaml diff --git a/tests/e2e/autoscaling/02-downscale.yaml b/tests/e2e/autoscaling.default/02-downscale.yaml similarity index 100% rename from tests/e2e/autoscaling/02-downscale.yaml rename to tests/e2e/autoscaling.default/02-downscale.yaml diff --git a/tests/e2e/autoscaling.revisions/00-assert.yaml b/tests/e2e/autoscaling.revisions/00-assert.yaml new file mode 100644 index 000000000..b6bdbee5c --- /dev/null +++ b/tests/e2e/autoscaling.revisions/00-assert.yaml @@ -0,0 +1,21 @@ +apiVersion: kuttl.dev/v1beta1 +kind: TestAssert +timeout: 90 +--- +apiVersion: vm.neon.tech/v1 +kind: VirtualMachine +metadata: + name: example +status: + phase: Running + restartCount: 0 + conditions: + - type: Available + status: "True" + cpus: 250m + memorySize: 1Gi + memoryProvider: DIMMSlots + currentRevision: + revision: + value: 123 + flags: 456 diff --git a/tests/e2e/autoscaling.revisions/00-create-vm.yaml b/tests/e2e/autoscaling.revisions/00-create-vm.yaml new file mode 100644 index 000000000..9b480f41c --- /dev/null +++ b/tests/e2e/autoscaling.revisions/00-create-vm.yaml @@ -0,0 +1,105 @@ +apiVersion: kuttl.dev/v1beta1 +kind: TestStep +unitTest: false +--- +apiVersion: v1 +kind: Service +metadata: + name: example +spec: + ports: + - name: postgres + port: 5432 + protocol: TCP + targetPort: postgres + type: NodePort + selector: + vm.neon.tech/name: example +--- +apiVersion: vm.neon.tech/v1 +kind: VirtualMachine +metadata: + name: example + labels: + autoscaling.neon.tech/enabled: "true" + annotations: + autoscaling.neon.tech/bounds: '{ "min": { "cpu": "250m", "mem": "1Gi" }, "max": {"cpu": 1, "mem": "4Gi" } }' +spec: + schedulerName: autoscale-scheduler + targetRevision: + revision: # Just arbitrary values + value: 123 + flags: 456 + updatedAt: 2006-01-02T15:04:05Z + guest: + cpus: + min: 0.25 + max: 1.25 # set value greater than bounds so our tests check we don't exceed the bounds. 
+ use: 0.25 + memorySlotSize: 1Gi + memorySlots: + min: 1 + max: 5 + use: 1 + rootDisk: + image: vm-postgres:15-bullseye + size: 8Gi + args: + - -c + - 'config_file=/etc/postgresql/postgresql.conf' + env: + # for testing only - allows login without password + - name: POSTGRES_HOST_AUTH_METHOD + value: trust + ports: + - name: postgres + port: 5432 + - name: host-metrics + port: 9100 + - name: monitor + port: 10301 + extraNetwork: + enable: true + disks: + - name: pgdata + mountPath: /var/lib/postgresql + emptyDisk: + size: 16Gi + - name: postgres-config + mountPath: /etc/postgresql + configMap: + name: example-config + items: + - key: postgresql.conf + path: postgresql.conf + - name: cache + mountPath: /neonvm/cache + tmpfs: + size: 1Gi + +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: example-config +data: + postgresql.conf: | + listen_addresses = '*' + shared_preload_libraries = 'pg_stat_statements' + + max_connections = 64 + shared_buffers = 256MB + effective_cache_size = 1536MB + maintenance_work_mem = 128MB + checkpoint_completion_target = 0.9 + wal_buffers = 16MB + default_statistics_target = 100 + random_page_cost = 1.1 + effective_io_concurrency = 200 + work_mem = 4MB + min_wal_size = 1GB + max_wal_size = 4GB + max_worker_processes = 4 + max_parallel_workers_per_gather = 2 + max_parallel_workers = 4 + max_parallel_maintenance_workers = 2 diff --git a/tests/e2e/autoscaling.revisions/01-assert.yaml b/tests/e2e/autoscaling.revisions/01-assert.yaml new file mode 100644 index 000000000..b3e818c36 --- /dev/null +++ b/tests/e2e/autoscaling.revisions/01-assert.yaml @@ -0,0 +1,34 @@ +apiVersion: kuttl.dev/v1beta1 +kind: TestAssert +timeout: 90 +--- +apiVersion: vm.neon.tech/v1 +kind: VirtualMachine +metadata: + name: example +spec: + targetRevision: + # Note that revision goes backward, compared with the previous step. + # This is intentional, this will happen each time autoscaler-agent restarts. 
+ revision: + value: 1 + flags: 1 # 1 for Upscale +status: + phase: Running + restartCount: 0 + conditions: + - type: Available + status: "True" + cpus: 1 + memorySize: 4Gi + currentRevision: # Already propagated from above + revision: + value: 1 + flags: 1 +--- +apiVersion: v1 +kind: Pod +metadata: + name: workload +status: + phase: Running diff --git a/tests/e2e/autoscaling.revisions/01-upscale.yaml b/tests/e2e/autoscaling.revisions/01-upscale.yaml new file mode 100644 index 000000000..d95636340 --- /dev/null +++ b/tests/e2e/autoscaling.revisions/01-upscale.yaml @@ -0,0 +1,49 @@ +apiVersion: kuttl.dev/v1beta1 +kind: TestStep +unitTest: false +--- +apiVersion: v1 +kind: Pod +metadata: + name: workload +spec: + terminationGracePeriodSeconds: 1 + initContainers: + - name: wait-for-pg + image: postgres:15-bullseye + command: + - sh + - "-c" + - | + set -e + until pg_isready --username=postgres --dbname=postgres --host=example --port=5432; do + sleep 1 + done + containers: + - name: pgbench + image: postgres:15-bullseye + volumeMounts: + - name: my-volume + mountPath: /etc/misc + command: + - pgbench + args: + - postgres://postgres@example:5432/postgres + - --client=20 + - --progress=1 + - --progress-timestamp + - --time=600 + - --file=/etc/misc/query.sql + volumes: + - name: my-volume + configMap: + name: query + restartPolicy: Never +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: query +data: + query.sql: | + select length(factorial(length(factorial(1223)::text)/2)::text); From 63891056d5577580614936ba7afb8aade912d612 Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Tue, 16 Jul 2024 16:07:32 +0400 Subject: [PATCH 35/57] rollback accidental changes Signed-off-by: Oleg Vasilev --- pkg/agent/core/state.go | 7 +------ pkg/agent/core/testhelpers/assert.go | 5 ----- pkg/agent/core/testhelpers/construct.go | 1 - pkg/agent/executor/core.go | 6 +----- 4 files changed, 2 insertions(+), 17 deletions(-) diff --git a/pkg/agent/core/state.go b/pkg/agent/core/state.go index 49feef993..c330c8096 100644 --- a/pkg/agent/core/state.go +++ b/pkg/agent/core/state.go @@ -331,12 +331,7 @@ func (s *state) nextActions(now time.Time) ActionSet { // Requests to vm-monitor (downscaling) plannedUpscale := actions.MonitorUpscale != nil var monitorDownscaleRequiredWait *time.Duration - actions.MonitorDownscale, monitorDownscaleRequiredWait = - s.calculateMonitorDownscaleAction( - now, - desiredResources, - plannedUpscale, - ) + actions.MonitorDownscale, monitorDownscaleRequiredWait = s.calculateMonitorDownscaleAction(now, desiredResources, plannedUpscale) // --- and that's all the request types!
--- diff --git a/pkg/agent/core/testhelpers/assert.go b/pkg/agent/core/testhelpers/assert.go index 19ecdb228..2fb5bf288 100644 --- a/pkg/agent/core/testhelpers/assert.go +++ b/pkg/agent/core/testhelpers/assert.go @@ -79,11 +79,6 @@ func (a Assert) Call(f any, args ...any) PreparedFunctionCall { var argValues []reflect.Value for _, a := range args { - if _, ok := a.(reflect.Value); ok { - // This is a SafeVal value, so we can just use it directly - argValues = append(argValues, a.(reflect.Value)) - continue - } argValues = append(argValues, reflect.ValueOf(a)) } diff --git a/pkg/agent/core/testhelpers/construct.go b/pkg/agent/core/testhelpers/construct.go index eb67a65f5..ca5b21703 100644 --- a/pkg/agent/core/testhelpers/construct.go +++ b/pkg/agent/core/testhelpers/construct.go @@ -38,7 +38,6 @@ type VmInfoOpt interface { func CreateInitialState(config InitialStateConfig, opts ...InitialStateOpt) *core.State { vmOpts := []VmInfoOpt{} - for _, o := range opts { if vo, ok := o.(VmInfoOpt); ok { vmOpts = append(vmOpts, vo) diff --git a/pkg/agent/executor/core.go b/pkg/agent/executor/core.go index ba9f5e9d8..ef8a22a22 100644 --- a/pkg/agent/executor/core.go +++ b/pkg/agent/executor/core.go @@ -53,11 +53,7 @@ type ClientSet struct { Monitor MonitorInterface } -func NewExecutorCore( - stateLogger *zap.Logger, - vm api.VmInfo, - config Config, -) *ExecutorCore { +func NewExecutorCore(stateLogger *zap.Logger, vm api.VmInfo, config Config) *ExecutorCore { return &ExecutorCore{ mu: sync.Mutex{}, stateLogger: stateLogger, From 76df85be7962bf04e8aed3dda070b4e37273596b Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Tue, 16 Jul 2024 16:08:03 +0400 Subject: [PATCH 36/57] tmp: use expectedRevision in tests Signed-off-by: Oleg Vasilev --- pkg/agent/core/state_test.go | 167 +++++++++++++------------ pkg/agent/core/testhelpers/revision.go | 22 ++++ 2 files changed, 109 insertions(+), 80 deletions(-) create mode 100644 pkg/agent/core/testhelpers/revision.go diff --git a/pkg/agent/core/state_test.go b/pkg/agent/core/state_test.go index f4d1e7294..ae9ad04bc 100644 --- a/pkg/agent/core/state_test.go +++ b/pkg/agent/core/state_test.go @@ -291,6 +291,7 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { clockTick := func() helpers.Elapsed { return clock.Inc(100 * time.Millisecond) } + expectedRevision := helpers.NewExpectedRevision(clock.Now) resForCU := DefaultComputeUnit.Mul cfg := DefaultInitialStateConfig @@ -298,7 +299,6 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { defer latencyObserver.assertEmpty() cfg.Core.RevisionSource = revsource.NewRevisionSource(latencyObserver.observe) - expectedRevision := vmv1.ZeroRevision state := helpers.CreateInitialState( cfg, helpers.WithStoredWarnings(a.StoredWarnings()), @@ -328,8 +328,8 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { // scale-up would be a good idea, we should be contacting the scheduler to get approval. // Revision advances. - expectedRevision.Value = 1 - expectedRevision.Flags = revsource.Upscale + expectedRevision.Revision.Value = 1 + expectedRevision.Revision.Flags = revsource.Upscale // We should be contacting the scheduler to get approval. 
a.Call(nextActions).Equals(core.ActionSet{ @@ -337,7 +337,7 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { LastPermit: lo.ToPtr(resForCU(1)), Target: resForCU(2), Metrics: lo.ToPtr(lastMetrics.ToAPI()), - TargetRevision: expectedRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, }) // start the request: @@ -345,7 +345,7 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { clockTick().AssertEquals(duration("0.3s")) // should have nothing more to do; waiting on plugin request to come back a.Call(nextActions).Equals(core.ActionSet{}) - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), expectedRevision.WithTime(clock.Now()), api.PluginResponse{ + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), expectedRevision.WithTime(), api.PluginResponse{ Permit: resForCU(2), Migrate: nil, }) @@ -359,7 +359,7 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { NeonVMRequest: &core.ActionNeonVMRequest{ Current: resForCU(1), Target: resForCU(2), - TargetRevision: expectedRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, }) // start the request: @@ -378,7 +378,7 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { vmInfo := helpers.CreateVmInfo( DefaultInitialStateConfig.VM, ) - rev := expectedRevision.WithTime(clock.Now()) + rev := expectedRevision.WithTime() vmInfo.CurrentRevision = &rev a.Do(state.UpdatedVM, vmInfo) @@ -391,7 +391,7 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { MonitorUpscale: &core.ActionMonitorUpscale{ Current: resForCU(1), Target: resForCU(2), - TargetRevision: expectedRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, }) // start the request: @@ -432,7 +432,7 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { MonitorDownscale: &core.ActionMonitorDownscale{ Current: resForCU(2), Target: resForCU(1), - TargetRevision: expectedRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, }) a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(1)) @@ -441,7 +441,7 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: duration("4.5s")}, }) - a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now(), expectedRevision.WithTime(clock.Now())) + a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now(), expectedRevision.WithTime()) // After getting approval from the vm-monitor, we make the request to NeonVM to carry it out a.Call(nextActions).Equals(core.ActionSet{ @@ -449,7 +449,7 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { NeonVMRequest: &core.ActionNeonVMRequest{ Current: resForCU(2), Target: resForCU(1), - TargetRevision: expectedRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, }) a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(1)) @@ -470,7 +470,7 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { ) // This means that the NeonVM has applied the changes. 
- rev = expectedRevision.WithTime(clock.Now()) + rev = expectedRevision.WithTime() vmInfo.CurrentRevision = &rev a.Do(state.UpdatedVM, vmInfo) @@ -483,7 +483,7 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { LastPermit: lo.ToPtr(resForCU(2)), Target: resForCU(1), Metrics: lo.ToPtr(lastMetrics.ToAPI()), - TargetRevision: expectedRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, // shouldn't have anything to say to the other components }) @@ -491,7 +491,7 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { clockTick().AssertEquals(duration("1s")) // should have nothing more to do; waiting on plugin request to come back a.Call(nextActions).Equals(core.ActionSet{}) - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), expectedRevision.WithTime(clock.Now()), api.PluginResponse{ + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), expectedRevision.WithTime(), api.PluginResponse{ Permit: resForCU(1), Migrate: nil, }) @@ -506,6 +506,7 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { func TestPeriodicPluginRequest(t *testing.T) { a := helpers.NewAssert(t) clock := helpers.NewFakeClock(t) + expectedRevision := helpers.NewExpectedRevision(clock.Now) cfg := DefaultInitialStateConfig latencyObserver := &latencyObserver{t: t, observations: nil} @@ -550,7 +551,7 @@ func TestPeriodicPluginRequest(t *testing.T) { }) clock.Inc(clockTick) } else { - target := vmv1.ZeroRevision.WithTime(clock.Now()) + target := expectedRevision.WithTime() a.Call(state.NextActions, clock.Now()).Equals(core.ActionSet{ PluginRequest: &core.ActionPluginRequest{ LastPermit: &resources, @@ -582,6 +583,7 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { clockTick := func() { clock.Inc(clockTickDuration) } + expectedRevision := helpers.NewExpectedRevision(clock.Now) resForCU := DefaultComputeUnit.Mul state := helpers.CreateInitialState( @@ -641,12 +643,12 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { MonitorDownscale: &core.ActionMonitorDownscale{ Current: resForCU(6), Target: resForCU(5), - TargetRevision: zeroRev(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, }) a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(5)) clockTick() - a.Do(state.Monitor().DownscaleRequestDenied, clock.Now(), zeroRev(clock.Now())) + a.Do(state.Monitor().DownscaleRequestDenied, clock.Now(), expectedRevision.WithTime()) // At the end, we should be waiting to retry downscaling: a.Call(nextActions).Equals(core.ActionSet{ @@ -664,7 +666,7 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { expectedNeonVMRequest = &core.ActionNeonVMRequest{ Current: resForCU(6), Target: resForCU(cu + 1), - TargetRevision: zeroRev(clock.Now()), + TargetRevision: expectedRevision.WithTime(), } } @@ -673,7 +675,7 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { MonitorDownscale: &core.ActionMonitorDownscale{ Current: resForCU(cu + 1), Target: resForCU(cu), - TargetRevision: zeroRev(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, NeonVMRequest: expectedNeonVMRequest, }) @@ -685,9 +687,9 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { clockTick() currentPluginWait -= clockTickDuration if cu >= 3 /* allow down to 3 */ { - a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now(), zeroRev(clock.Now())) + a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now(), expectedRevision.WithTime()) } else { - a.Do(state.Monitor().DownscaleRequestDenied, clock.Now(), zeroRev(clock.Now())) + 
a.Do(state.Monitor().DownscaleRequestDenied, clock.Now(), expectedRevision.WithTime()) } } // At this point, waiting 3.7s for next attempt to downscale below 3 CU (last request was @@ -698,7 +700,7 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { NeonVMRequest: &core.ActionNeonVMRequest{ Current: resForCU(6), Target: resForCU(3), - TargetRevision: zeroRev(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, }) // Make the request: @@ -716,7 +718,7 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { LastPermit: lo.ToPtr(resForCU(6)), Target: resForCU(3), Metrics: lo.ToPtr(metrics.ToAPI()), - TargetRevision: zeroRev(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, }) a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(3)) @@ -724,7 +726,7 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { Wait: &core.ActionWait{Duration: duration("3.9s")}, }) clockTick() - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), vmv1.ZeroRevision.WithTime(clock.Now()), api.PluginResponse{ + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), expectedRevision.WithTime(), api.PluginResponse{ Permit: resForCU(3), Migrate: nil, }) @@ -742,12 +744,12 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { MonitorDownscale: &core.ActionMonitorDownscale{ Current: resForCU(3), Target: resForCU(2), - TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, }) a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(2)) clockTick() - a.Do(state.Monitor().DownscaleRequestDenied, clock.Now(), vmv1.ZeroRevision.WithTime(clock.Now())) + a.Do(state.Monitor().DownscaleRequestDenied, clock.Now(), expectedRevision.WithTime()) // At the end, we should be waiting to retry downscaling (but actually, the regular plugin // request is coming up sooner). 
a.Call(nextActions).Equals(core.ActionSet{ @@ -761,7 +763,7 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { LastPermit: lo.ToPtr(resForCU(3)), Target: resForCU(3), Metrics: lo.ToPtr(metrics.ToAPI()), - TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, }) a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(3)) @@ -769,7 +771,7 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { Wait: &core.ActionWait{Duration: duration("1s")}, // still waiting on retrying vm-monitor downscaling }) clockTick() - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), vmv1.ZeroRevision.WithTime(clock.Now()), api.PluginResponse{ + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), expectedRevision.WithTime(), api.PluginResponse{ Permit: resForCU(3), Migrate: nil, }) @@ -787,7 +789,7 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { expectedNeonVMRequest = &core.ActionNeonVMRequest{ Current: resForCU(3), Target: resForCU(cu + 1), - TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), } } @@ -796,7 +798,7 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { MonitorDownscale: &core.ActionMonitorDownscale{ Current: resForCU(cu + 1), Target: resForCU(cu), - TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, NeonVMRequest: expectedNeonVMRequest, }) @@ -807,7 +809,7 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { }) clockTick() currentPluginWait -= clockTickDuration - a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now(), vmv1.ZeroRevision.WithTime(clock.Now())) + a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now(), expectedRevision.WithTime()) } // Still waiting on plugin request tick, but we can make a NeonVM request to enact the // downscaling right away ! @@ -816,7 +818,7 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { NeonVMRequest: &core.ActionNeonVMRequest{ Current: resForCU(3), Target: resForCU(1), - TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, }) a.Do(state.NeonVM().StartingRequest, time.Now(), resForCU(1)) @@ -831,7 +833,7 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { LastPermit: lo.ToPtr(resForCU(3)), Target: resForCU(1), Metrics: lo.ToPtr(metrics.ToAPI()), - TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, }) a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(1)) @@ -839,7 +841,7 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { // not waiting on anything! 
}) clockTick() - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), vmv1.ZeroRevision.WithTime(clock.Now()), api.PluginResponse{ + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), expectedRevision.WithTime(), api.PluginResponse{ Permit: resForCU(1), Migrate: nil, }) @@ -857,6 +859,7 @@ func TestRequestedUpscale(t *testing.T) { clockTick := func() { clock.Inc(100 * time.Millisecond) } + expectedRevision := helpers.NewExpectedRevision(clock.Now) resForCU := DefaultComputeUnit.Mul cfg := DefaultInitialStateConfig @@ -864,7 +867,6 @@ func TestRequestedUpscale(t *testing.T) { defer latencyObserver.assertEmpty() cfg.Core.RevisionSource = revsource.NewRevisionSource(latencyObserver.observe) - expectedRevision := vmv1.ZeroRevision state := helpers.CreateInitialState( cfg, helpers.WithStoredWarnings(a.StoredWarnings()), @@ -907,7 +909,7 @@ func TestRequestedUpscale(t *testing.T) { LastPermit: lo.ToPtr(resForCU(1)), Target: resForCU(2), Metrics: lo.ToPtr(lastMetrics.ToAPI()), - TargetRevision: expectedRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, }) a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(2)) @@ -915,7 +917,7 @@ func TestRequestedUpscale(t *testing.T) { a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: duration("5.9s")}, // same waiting for requested upscale expiring }) - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), vmv1.ZeroRevision.WithTime(clock.Now()), api.PluginResponse{ + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), expectedRevision.WithTime(), api.PluginResponse{ Permit: resForCU(2), Migrate: nil, }) @@ -926,7 +928,7 @@ func TestRequestedUpscale(t *testing.T) { NeonVMRequest: &core.ActionNeonVMRequest{ Current: resForCU(1), Target: resForCU(2), - TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, }) a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(2)) @@ -938,7 +940,7 @@ func TestRequestedUpscale(t *testing.T) { DefaultInitialStateConfig.VM, helpers.WithCurrentCU(2), ) - rev := expectedRevision.WithTime(clock.Now()) + rev := expectedRevision.WithTime() vmInfo.CurrentRevision = &rev a.Do(state.UpdatedVM, vmInfo) latencyObserver.assert(duration("0.2s"), revsource.Upscale|revsource.Immediate) @@ -949,7 +951,7 @@ func TestRequestedUpscale(t *testing.T) { MonitorUpscale: &core.ActionMonitorUpscale{ Current: resForCU(1), Target: resForCU(2), - TargetRevision: expectedRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, }) a.Do(state.Monitor().StartingUpscaleRequest, clock.Now(), resForCU(2)) @@ -971,7 +973,7 @@ func TestRequestedUpscale(t *testing.T) { LastPermit: lo.ToPtr(resForCU(2)), Target: resForCU(2), Metrics: lo.ToPtr(lastMetrics.ToAPI()), - TargetRevision: expectedRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, }) a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(2)) @@ -979,7 +981,7 @@ func TestRequestedUpscale(t *testing.T) { a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: duration("0.9s")}, // waiting for requested upscale expiring }) - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), vmv1.ZeroRevision.WithTime(clock.Now()), api.PluginResponse{ + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), expectedRevision.WithTime(), api.PluginResponse{ Permit: resForCU(2), Migrate: nil, }) @@ -997,7 +999,7 @@ func TestRequestedUpscale(t *testing.T) { MonitorDownscale: &core.ActionMonitorDownscale{ 
Current: resForCU(2), Target: resForCU(1), - TargetRevision: expectedRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, }) } @@ -1010,7 +1012,6 @@ func TestRequestedUpscale(t *testing.T) { func TestDownscalePivotBack(t *testing.T) { a := helpers.NewAssert(t) var clock *helpers.FakeClock - clockTickDuration := duration("0.1s") clockTick := func() helpers.Elapsed { return clock.Inc(clockTickDuration) @@ -1018,6 +1019,7 @@ func TestDownscalePivotBack(t *testing.T) { halfClockTick := func() helpers.Elapsed { return clock.Inc(clockTickDuration / 2) } + var expectedRevision *helpers.ExpectedRevision resForCU := DefaultComputeUnit.Mul var state *core.State @@ -1048,7 +1050,7 @@ func TestDownscalePivotBack(t *testing.T) { Current: resForCU(2), Target: resForCU(1), - TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, }) a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(1)) @@ -1057,7 +1059,7 @@ func TestDownscalePivotBack(t *testing.T) { halfClockTick() *pluginWait -= clockTickDuration t.Log(" > finish vm-monitor downscale") - a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now(), vmv1.ZeroRevision.WithTime(clock.Now())) + a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now(), expectedRevision.WithTime()) }, post: func(pluginWait *time.Duration) { t.Log(" > start vm-monitor upscale") @@ -1066,7 +1068,7 @@ func TestDownscalePivotBack(t *testing.T) { MonitorUpscale: &core.ActionMonitorUpscale{ Current: resForCU(1), Target: resForCU(2), - TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, }) a.Do(state.Monitor().StartingUpscaleRequest, clock.Now(), resForCU(2)) @@ -1085,7 +1087,7 @@ func TestDownscalePivotBack(t *testing.T) { NeonVMRequest: &core.ActionNeonVMRequest{ Current: resForCU(2), Target: resForCU(1), - TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, }) a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(1)) @@ -1103,7 +1105,7 @@ func TestDownscalePivotBack(t *testing.T) { NeonVMRequest: &core.ActionNeonVMRequest{ Current: resForCU(1), Target: resForCU(2), - TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, }) a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(2)) @@ -1122,7 +1124,7 @@ func TestDownscalePivotBack(t *testing.T) { LastPermit: lo.ToPtr(resForCU(2)), Target: resForCU(1), Metrics: lo.ToPtr(initialMetrics.ToAPI()), - TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, }) a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(1)) @@ -1131,7 +1133,7 @@ func TestDownscalePivotBack(t *testing.T) { halfClockTick() *pluginWait = duration("4.9s") // reset because we just made a request t.Log(" > finish plugin downscale") - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), vmv1.ZeroRevision.WithTime(clock.Now()), api.PluginResponse{ + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), expectedRevision.WithTime(), api.PluginResponse{ Permit: resForCU(1), Migrate: nil, }) @@ -1143,14 +1145,14 @@ func TestDownscalePivotBack(t *testing.T) { LastPermit: lo.ToPtr(resForCU(1)), Target: resForCU(2), Metrics: lo.ToPtr(newMetrics.ToAPI()), - TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, }) a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(2)) clockTick() *pluginWait 
= duration("4.9s") // reset because we just made a request t.Log(" > finish plugin upscale") - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), vmv1.ZeroRevision.WithTime(clock.Now()), api.PluginResponse{ + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), expectedRevision.WithTime(), api.PluginResponse{ Permit: resForCU(2), Migrate: nil, }) @@ -1163,6 +1165,7 @@ func TestDownscalePivotBack(t *testing.T) { // Initial setup clock = helpers.NewFakeClock(t) + expectedRevision = helpers.NewExpectedRevision(clock.Now) state = helpers.CreateInitialState( DefaultInitialStateConfig, helpers.WithStoredWarnings(a.StoredWarnings()), @@ -1211,6 +1214,7 @@ func TestBoundsChangeRequiresDownsale(t *testing.T) { clockTick := func() { clock.Inc(100 * time.Millisecond) } + expectedRevision := helpers.NewExpectedRevision(clock.Now) resForCU := DefaultComputeUnit.Mul state := helpers.CreateInitialState( @@ -1262,12 +1266,12 @@ func TestBoundsChangeRequiresDownsale(t *testing.T) { Current: resForCU(2), Target: resForCU(1), - TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, }) a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(1)) clockTick() - a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now(), vmv1.ZeroRevision.WithTime(clock.Now())) + a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now(), expectedRevision.WithTime()) // Do NeonVM request for that downscaling a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: duration("4.6s")}, @@ -1275,7 +1279,7 @@ func TestBoundsChangeRequiresDownsale(t *testing.T) { Current: resForCU(2), Target: resForCU(1), - TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, }) a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(1)) @@ -1287,12 +1291,12 @@ func TestBoundsChangeRequiresDownsale(t *testing.T) { LastPermit: lo.ToPtr(resForCU(2)), Target: resForCU(1), Metrics: lo.ToPtr(metrics.ToAPI()), - TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, }) a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(1)) clockTick() - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), vmv1.ZeroRevision.WithTime(clock.Now()), api.PluginResponse{ + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), expectedRevision.WithTime(), api.PluginResponse{ Permit: resForCU(1), Migrate: nil, }) @@ -1310,6 +1314,7 @@ func TestBoundsChangeRequiresUpscale(t *testing.T) { clockTick := func() { clock.Inc(100 * time.Millisecond) } + expectedRevision := helpers.NewExpectedRevision(clock.Now) resForCU := DefaultComputeUnit.Mul state := helpers.CreateInitialState( @@ -1358,12 +1363,12 @@ func TestBoundsChangeRequiresUpscale(t *testing.T) { LastPermit: lo.ToPtr(resForCU(2)), Target: resForCU(3), Metrics: lo.ToPtr(metrics.ToAPI()), - TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, }) a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(3)) clockTick() - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), vmv1.ZeroRevision.WithTime(clock.Now()), api.PluginResponse{ + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), expectedRevision.WithTime(), api.PluginResponse{ Permit: resForCU(3), Migrate: nil, }) @@ -1373,7 +1378,7 @@ func TestBoundsChangeRequiresUpscale(t *testing.T) { NeonVMRequest: &core.ActionNeonVMRequest{ Current: resForCU(2), Target: resForCU(3), - TargetRevision: 
vmv1.ZeroRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, }) a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(3)) @@ -1385,7 +1390,7 @@ func TestBoundsChangeRequiresUpscale(t *testing.T) { MonitorUpscale: &core.ActionMonitorUpscale{ Current: resForCU(2), Target: resForCU(3), - TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, }) a.Do(state.Monitor().StartingUpscaleRequest, clock.Now(), resForCU(3)) @@ -1404,6 +1409,7 @@ func TestFailedRequestRetry(t *testing.T) { clockTick := func() { clock.Inc(100 * time.Millisecond) } + expectedRevision := helpers.NewExpectedRevision(clock.Now) resForCU := DefaultComputeUnit.Mul state := helpers.CreateInitialState( @@ -1440,7 +1446,7 @@ func TestFailedRequestRetry(t *testing.T) { LastPermit: lo.ToPtr(resForCU(1)), Target: resForCU(2), Metrics: lo.ToPtr(metrics.ToAPI()), - TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, }) a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(2)) @@ -1460,12 +1466,12 @@ func TestFailedRequestRetry(t *testing.T) { LastPermit: lo.ToPtr(resForCU(1)), Target: resForCU(2), Metrics: lo.ToPtr(metrics.ToAPI()), - TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, }) a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(2)) clockTick() - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), vmv1.ZeroRevision.WithTime(clock.Now()), api.PluginResponse{ + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), expectedRevision.WithTime(), api.PluginResponse{ Permit: resForCU(2), Migrate: nil, }) @@ -1477,7 +1483,7 @@ func TestFailedRequestRetry(t *testing.T) { NeonVMRequest: &core.ActionNeonVMRequest{ Current: resForCU(1), Target: resForCU(2), - TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, }) a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(2)) @@ -1496,7 +1502,7 @@ func TestFailedRequestRetry(t *testing.T) { NeonVMRequest: &core.ActionNeonVMRequest{ Current: resForCU(1), Target: resForCU(2), - TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, }) a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(2)) @@ -1509,7 +1515,7 @@ func TestFailedRequestRetry(t *testing.T) { MonitorUpscale: &core.ActionMonitorUpscale{ Current: resForCU(1), Target: resForCU(2), - TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, }) } @@ -1524,6 +1530,7 @@ func TestMetricsConcurrentUpdatedDuringDownscale(t *testing.T) { clockTick := func() { clock.Inc(100 * time.Millisecond) } + expectedRevision := helpers.NewExpectedRevision(clock.Now) resForCU := DefaultComputeUnit.Mul state := helpers.CreateInitialState( @@ -1548,12 +1555,12 @@ func TestMetricsConcurrentUpdatedDuringDownscale(t *testing.T) { LastPermit: nil, Target: resForCU(3), Metrics: nil, - TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, }) a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(3)) clockTick() - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), vmv1.ZeroRevision.WithTime(clock.Now()), api.PluginResponse{ + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), expectedRevision.WithTime(), api.PluginResponse{ Permit: resForCU(3), Migrate: nil, }) @@ -1568,7 +1575,7 @@ func 
TestMetricsConcurrentUpdatedDuringDownscale(t *testing.T) { MonitorDownscale: &core.ActionMonitorDownscale{ Current: resForCU(3), Target: resForCU(2), - TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, }) a.Do(state.Monitor().StartingDownscaleRequest, clock.Now(), resForCU(2)) @@ -1593,18 +1600,18 @@ func TestMetricsConcurrentUpdatedDuringDownscale(t *testing.T) { // When the vm-monitor request finishes, we want to both // (a) request additional downscaling from vm-monitor, and // (b) make a NeonVM request for the initially approved downscaling - a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now(), vmv1.ZeroRevision.WithTime(clock.Now())) + a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now(), expectedRevision.WithTime()) a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: duration("4.6s")}, // plugin request tick wait NeonVMRequest: &core.ActionNeonVMRequest{ Current: resForCU(3), Target: resForCU(2), - TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, MonitorDownscale: &core.ActionMonitorDownscale{ Current: resForCU(2), Target: resForCU(1), - TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, }) // Start both requests. The vm-monitor request will finish first, but after that we'll just be @@ -1614,7 +1621,7 @@ func TestMetricsConcurrentUpdatedDuringDownscale(t *testing.T) { clockTick() - a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now(), vmv1.ZeroRevision.WithTime(clock.Now())) + a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now(), expectedRevision.WithTime()) a. WithWarnings( "Wanted to make a request to NeonVM API, but there's already NeonVM request (for different resources) ongoing", @@ -1638,12 +1645,12 @@ func TestMetricsConcurrentUpdatedDuringDownscale(t *testing.T) { LastPermit: lo.ToPtr(resForCU(3)), Target: resForCU(2), Metrics: lo.ToPtr(metrics.ToAPI()), - TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, NeonVMRequest: &core.ActionNeonVMRequest{ Current: resForCU(2), Target: resForCU(1), - TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, }) a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(2)) @@ -1651,7 +1658,7 @@ func TestMetricsConcurrentUpdatedDuringDownscale(t *testing.T) { clockTick() - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), vmv1.ZeroRevision.WithTime(clock.Now()), api.PluginResponse{ + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), expectedRevision.WithTime(), api.PluginResponse{ Permit: resForCU(2), Migrate: nil, }) @@ -1670,14 +1677,14 @@ func TestMetricsConcurrentUpdatedDuringDownscale(t *testing.T) { LastPermit: lo.ToPtr(resForCU(2)), Target: resForCU(1), Metrics: lo.ToPtr(metrics.ToAPI()), - TargetRevision: vmv1.ZeroRevision.WithTime(clock.Now()), + TargetRevision: expectedRevision.WithTime(), }, }) a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(1)) clockTick() - a.NoError(state.Plugin().RequestSuccessful, clock.Now(), vmv1.ZeroRevision.WithTime(clock.Now()), api.PluginResponse{ + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), expectedRevision.WithTime(), api.PluginResponse{ Permit: resForCU(1), Migrate: nil, }) diff --git a/pkg/agent/core/testhelpers/revision.go b/pkg/agent/core/testhelpers/revision.go new file mode 100644 index 000000000..4ce56c0d4 --- /dev/null +++ 
b/pkg/agent/core/testhelpers/revision.go @@ -0,0 +1,22 @@ +package testhelpers + +import ( + vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1" + "time" +) + +type ExpectedRevision struct { + vmv1.Revision + Now func() time.Time +} + +func NewExpectedRevision(now func() time.Time) *ExpectedRevision { + return &ExpectedRevision{ + Now: now, + Revision: vmv1.ZeroRevision, + } +} + +func (e *ExpectedRevision) WithTime() vmv1.RevisionWithTime { + return e.Revision.WithTime(e.Now()) +} From 0908e1b1948779b82c99f707b9766cb597d104fe Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Tue, 16 Jul 2024 16:14:00 +0400 Subject: [PATCH 37/57] incremental cleanup Signed-off-by: Oleg Vasilev --- pkg/agent/core/state_test.go | 29 +++++++++++--------------- pkg/agent/core/testhelpers/revision.go | 3 ++- 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/pkg/agent/core/state_test.go b/pkg/agent/core/state_test.go index ae9ad04bc..403e16c2c 100644 --- a/pkg/agent/core/state_test.go +++ b/pkg/agent/core/state_test.go @@ -253,10 +253,6 @@ func duration(s string) time.Duration { return d } -func zeroRev(t time.Time) vmv1.RevisionWithTime { - return vmv1.ZeroRevision.WithTime(t) -} - type latencyObserver struct { t *testing.T observations []struct { @@ -294,15 +290,15 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { expectedRevision := helpers.NewExpectedRevision(clock.Now) resForCU := DefaultComputeUnit.Mul - cfg := DefaultInitialStateConfig latencyObserver := &latencyObserver{t: t, observations: nil} defer latencyObserver.assertEmpty() - cfg.Core.RevisionSource = revsource.NewRevisionSource(latencyObserver.observe) - state := helpers.CreateInitialState( - cfg, + DefaultInitialStateConfig, helpers.WithStoredWarnings(a.StoredWarnings()), helpers.WithTestingLogfWarnings(t), + helpers.WithConfigSetting(func(c *core.Config) { + c.RevisionSource = revsource.NewRevisionSource(latencyObserver.observe) + }), ) nextActions := func() core.ActionSet { return state.NextActions(clock.Now()) @@ -508,16 +504,17 @@ func TestPeriodicPluginRequest(t *testing.T) { clock := helpers.NewFakeClock(t) expectedRevision := helpers.NewExpectedRevision(clock.Now) - cfg := DefaultInitialStateConfig latencyObserver := &latencyObserver{t: t, observations: nil} defer latencyObserver.assertEmpty() - // This time, we will test plugin latency - cfg.Core.PromMetricsCallbacks.PluginLatency = latencyObserver.observe - cfg.Core.RevisionSource = revsource.NewRevisionSource(nil) state := helpers.CreateInitialState( - cfg, + DefaultInitialStateConfig, helpers.WithStoredWarnings(a.StoredWarnings()), + helpers.WithConfigSetting(func(c *core.Config) { + c.PromMetricsCallbacks.PluginLatency = latencyObserver.observe + // This time, we will test plugin latency + c.RevisionSource = revsource.NewRevisionSource(nil) + }), ) state.Monitor().Active(true) @@ -862,15 +859,13 @@ func TestRequestedUpscale(t *testing.T) { expectedRevision := helpers.NewExpectedRevision(clock.Now) resForCU := DefaultComputeUnit.Mul - cfg := DefaultInitialStateConfig latencyObserver := &latencyObserver{t: t, observations: nil} defer latencyObserver.assertEmpty() - cfg.Core.RevisionSource = revsource.NewRevisionSource(latencyObserver.observe) - state := helpers.CreateInitialState( - cfg, + DefaultInitialStateConfig, helpers.WithStoredWarnings(a.StoredWarnings()), helpers.WithConfigSetting(func(c *core.Config) { + c.RevisionSource = revsource.NewRevisionSource(latencyObserver.observe) c.MonitorRequestedUpscaleValidPeriod = duration("6s") // Override this for 
consistency }), ) diff --git a/pkg/agent/core/testhelpers/revision.go b/pkg/agent/core/testhelpers/revision.go index 4ce56c0d4..b2b28d3c5 100644 --- a/pkg/agent/core/testhelpers/revision.go +++ b/pkg/agent/core/testhelpers/revision.go @@ -1,8 +1,9 @@ package testhelpers import ( - vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1" "time" + + vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1" ) type ExpectedRevision struct { From 5b0f93d1f382dad6d01c62bee350591bbf214817 Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Tue, 16 Jul 2024 17:21:07 +0400 Subject: [PATCH 38/57] iterative changes Signed-off-by: Oleg Vasilev --- pkg/agent/core/revsource/revsource.go | 20 +++++--------------- pkg/agent/core/state.go | 20 ++++++++++---------- pkg/agent/core/state_test.go | 11 +++-------- pkg/agent/core/testhelpers/revision.go | 10 ++++++++++ pkg/agent/runner.go | 2 +- 5 files changed, 29 insertions(+), 34 deletions(-) diff --git a/pkg/agent/core/revsource/revsource.go b/pkg/agent/core/revsource/revsource.go index fe7ff4107..ea0b34dcf 100644 --- a/pkg/agent/core/revsource/revsource.go +++ b/pkg/agent/core/revsource/revsource.go @@ -37,7 +37,7 @@ func FlagsToLabels(flags vmv1.Flag) []string { // Once RevisionSource observes a previously generated Revision after some time, // the time it took since that Revision was generated. type RevisionSource struct { - cb MetricCB + cb ObserveCallback // The in-flight revisions are stored in-order. // After the revision is observed, it is removed from the measurements, and the offset is increased. @@ -45,7 +45,7 @@ type RevisionSource struct { offset int64 } -func NewRevisionSource(cb MetricCB) *RevisionSource { +func NewRevisionSource(cb ObserveCallback) *RevisionSource { return &RevisionSource{ cb: cb, measurements: nil, @@ -90,19 +90,9 @@ func (c *RevisionSource) Observe(moment time.Time, rev vmv1.Revision) error { return nil } -type NilRevisionSource struct{} +type ObserveCallback func(dur time.Duration, flags vmv1.Flag) -func (c *NilRevisionSource) Next(_ time.Time, _ vmv1.Flag) vmv1.Revision { - return vmv1.Revision{ - Value: 0, - Flags: 0, - } -} -func (c *NilRevisionSource) Observe(_ time.Time, _ vmv1.Revision) error { return nil } - -type MetricCB func(dur time.Duration, flags vmv1.Flag) - -func WrapHistogramVec(hist *prometheus.HistogramVec) MetricCB { +func WrapHistogramVec(hist *prometheus.HistogramVec) ObserveCallback { return func(dur time.Duration, flags vmv1.Flag) { labels := FlagsToLabels(flags) hist.WithLabelValues(labels...).Observe(dur.Seconds()) @@ -115,7 +105,7 @@ func Propagate( now time.Time, target vmv1.RevisionWithTime, currentSlot *vmv1.Revision, - metricCB MetricCB, + metricCB ObserveCallback, ) { if metricCB != nil { diff := now.Sub(target.UpdatedAt.Time) diff --git a/pkg/agent/core/state.go b/pkg/agent/core/state.go index c330c8096..3a6e7c455 100644 --- a/pkg/agent/core/state.go +++ b/pkg/agent/core/state.go @@ -36,10 +36,15 @@ import ( "github.com/neondatabase/autoscaling/pkg/util" ) -type PromMetricsCallbacks struct { - PluginLatency revsource.MetricCB - MonitorLatency revsource.MetricCB - NeonVMLatency revsource.MetricCB +type ObservabilityCallbacks struct { + PluginLatency revsource.ObserveCallback + MonitorLatency revsource.ObserveCallback + NeonVMLatency revsource.ObserveCallback +} + +type RevisionSource interface { + Next(ts time.Time, flags vmv1.Flag) vmv1.Revision + Observe(moment time.Time, rev vmv1.Revision) error } // Config represents some of the static configuration underlying the 
decision-making of State @@ -85,7 +90,7 @@ type Config struct { RevisionSource RevisionSource `json:"-"` // PromMetricsCallbacks are the callbacks to update the Prometheus metrics. - PromMetricsCallbacks PromMetricsCallbacks `json:"-"` + PromMetricsCallbacks ObservabilityCallbacks `json:"-"` } type LogConfig struct { @@ -225,11 +230,6 @@ func (ns *neonvmState) ongoingRequest() bool { return ns.OngoingRequested != nil } -type RevisionSource interface { - Next(ts time.Time, flags vmv1.Flag) vmv1.Revision - Observe(moment time.Time, rev vmv1.Revision) error -} - func NewState(vm api.VmInfo, config Config) *State { state := &State{ internal: state{ diff --git a/pkg/agent/core/state_test.go b/pkg/agent/core/state_test.go index 403e16c2c..5c1a2ea04 100644 --- a/pkg/agent/core/state_test.go +++ b/pkg/agent/core/state_test.go @@ -132,7 +132,7 @@ func Test_DesiredResourcesFromMetricsOrRequestedUpscaling(t *testing.T) { }, }, RevisionSource: revsource.NewRevisionSource(nil), - PromMetricsCallbacks: core.PromMetricsCallbacks{ + PromMetricsCallbacks: core.ObservabilityCallbacks{ PluginLatency: nil, MonitorLatency: nil, NeonVMLatency: nil, @@ -205,8 +205,8 @@ var DefaultInitialStateConfig = helpers.InitialStateConfig{ Info: nil, Warn: nil, }, - RevisionSource: &revsource.NilRevisionSource{}, - PromMetricsCallbacks: core.PromMetricsCallbacks{ + RevisionSource: &helpers.NilRevisionSource{}, + PromMetricsCallbacks: core.ObservabilityCallbacks{ PluginLatency: nil, MonitorLatency: nil, NeonVMLatency: nil, @@ -459,11 +459,6 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { // Request to NeonVM is successful, but let's wait one more tick for // NeonVM to pick up the changes and apply those. clockTick().AssertEquals(duration("0.9s")) - vmInfo = helpers.CreateVmInfo( - DefaultInitialStateConfig.VM, - helpers.WithCurrentCU(1), - helpers.WithMinMaxCU(1, 1), - ) // This means that the NeonVM has applied the changes. 
rev = expectedRevision.WithTime() diff --git a/pkg/agent/core/testhelpers/revision.go b/pkg/agent/core/testhelpers/revision.go index b2b28d3c5..3c0106b39 100644 --- a/pkg/agent/core/testhelpers/revision.go +++ b/pkg/agent/core/testhelpers/revision.go @@ -21,3 +21,13 @@ func NewExpectedRevision(now func() time.Time) *ExpectedRevision { func (e *ExpectedRevision) WithTime() vmv1.RevisionWithTime { return e.Revision.WithTime(e.Now()) } + +type NilRevisionSource struct{} + +func (c *NilRevisionSource) Next(_ time.Time, _ vmv1.Flag) vmv1.Revision { + return vmv1.Revision{ + Value: 0, + Flags: 0, + } +} +func (c *NilRevisionSource) Observe(_ time.Time, _ vmv1.Revision) error { return nil } diff --git a/pkg/agent/runner.go b/pkg/agent/runner.go index f87c8ce68..9b4fb9325 100644 --- a/pkg/agent/runner.go +++ b/pkg/agent/runner.go @@ -216,7 +216,7 @@ func (r *Runner) Run(ctx context.Context, logger *zap.Logger, vmInfoUpdated util Warn: coreExecLogger.Warn, }, RevisionSource: revisionSource, - PromMetricsCallbacks: core.PromMetricsCallbacks{ + PromMetricsCallbacks: core.ObservabilityCallbacks{ PluginLatency: revsource.WrapHistogramVec(&r.global.metrics.pluginLatency), MonitorLatency: revsource.WrapHistogramVec(&r.global.metrics.monitorLatency), NeonVMLatency: revsource.WrapHistogramVec(&r.global.metrics.neonvmLatency), From 0e0b2ca91867ec828a42e3c6c4af5925ef9eb027 Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Wed, 17 Jul 2024 15:52:39 +0400 Subject: [PATCH 39/57] prevent unbounded growth Signed-off-by: Oleg Vasilev --- pkg/agent/core/revsource/revsource.go | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pkg/agent/core/revsource/revsource.go b/pkg/agent/core/revsource/revsource.go index ea0b34dcf..f6d1be722 100644 --- a/pkg/agent/core/revsource/revsource.go +++ b/pkg/agent/core/revsource/revsource.go @@ -32,6 +32,11 @@ func FlagsToLabels(flags vmv1.Flag) []string { return ret } +// MaxRevisions is the maximum number of revisions that can be stored in the RevisionSource. +// This is to prevent memory leaks. +// Upon reaching it, the oldest revisions are discarded. +const MaxRevisions = 100 + // RevisionSource can generate and observe revisions. // Each Revision is a value and a set of flags (for meta-information). 
// Once RevisionSource observes a previously generated Revision after some time, @@ -63,6 +68,12 @@ func (c *RevisionSource) Next(now time.Time, flags vmv1.Flag) vmv1.Revision { Flags: flags, } c.measurements = append(c.measurements, now) + + if len(c.measurements) > MaxRevisions { + c.measurements = c.measurements[1:] + c.offset++ + } + return ret } From 9facc7dc9f834c8f844b9077018b94cb16967916 Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Thu, 18 Jul 2024 12:10:53 +0400 Subject: [PATCH 40/57] fix format Signed-off-by: Oleg Vasilev --- pkg/agent/prommetrics.go | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pkg/agent/prommetrics.go b/pkg/agent/prommetrics.go index f4bd5c998..8a46680ab 100644 --- a/pkg/agent/prommetrics.go +++ b/pkg/agent/prommetrics.go @@ -246,28 +246,32 @@ func makeGlobalMetrics() (GlobalMetrics, *prometheus.Registry) { Name: "autoscaling_agent_scaling_latency_seconds", Help: "End-to-end scaling latency", Buckets: buckets, - }, revsource.AllFlagNames, + }, + revsource.AllFlagNames, )), pluginLatency: *util.RegisterMetric(reg, prometheus.NewHistogramVec( prometheus.HistogramOpts{ Name: "autoscaling_agent_plugin_latency_seconds", Help: "Plugin request latency", Buckets: buckets, - }, revsource.AllFlagNames, + }, + revsource.AllFlagNames, )), monitorLatency: *util.RegisterMetric(reg, prometheus.NewHistogramVec( prometheus.HistogramOpts{ Name: "autoscaling_agent_monitor_latency_seconds", Help: "Monitor request latency", Buckets: buckets, - }, revsource.AllFlagNames, + }, + revsource.AllFlagNames, )), neonvmLatency: *util.RegisterMetric(reg, prometheus.NewHistogramVec( prometheus.HistogramOpts{ Name: "autoscaling_agent_neonvm_latency_seconds", Help: "NeonVM request latency", Buckets: buckets, - }, revsource.AllFlagNames, + }, + revsource.AllFlagNames, )), } From 88675b9d6466da852b6fde56008c956606a2d348 Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Thu, 18 Jul 2024 16:12:21 +0400 Subject: [PATCH 41/57] one more test Signed-off-by: Oleg Vasilev --- pkg/agent/core/state.go | 24 ++++++++++-------------- pkg/agent/core/state_test.go | 15 +++++++++++++++ 2 files changed, 25 insertions(+), 14 deletions(-) diff --git a/pkg/agent/core/state.go b/pkg/agent/core/state.go index 3a6e7c455..5f5ed833a 100644 --- a/pkg/agent/core/state.go +++ b/pkg/agent/core/state.go @@ -474,12 +474,12 @@ func (s *state) calculateNeonVMAction( pluginRequestedPhase string, ) (*ActionNeonVMRequest, *time.Duration) { targetRevision := s.TargetRevision - if desiredResources.HasFieldLessThan(s.VM.Using()) { + if desiredResources.HasFieldLessThan(s.VM.Using()) && s.Monitor.CurrentRevision.Value > 0 { // We are downscaling, so we needed a permit from the monitor targetRevision = targetRevision.Min(s.Monitor.CurrentRevision) } - if desiredResources.HasFieldGreaterThan(s.VM.Using()) { + if desiredResources.HasFieldGreaterThan(s.VM.Using()) && s.Plugin.CurrentRevision.Value > 0 { // We are upscaling, so we needed a permit from the plugin targetRevision = targetRevision.Min(s.Plugin.CurrentRevision) } @@ -868,23 +868,20 @@ func (s *state) updateTargetRevision( immediate bool, ) { if s.LastDesiredResources == nil { - if desired == current { - // First iteration, and no scaling required - return - } - } else { - if *s.LastDesiredResources == desired { - // Nothing changed, so no need to update the target revision - return - } + s.LastDesiredResources = &current + } + + if *s.LastDesiredResources == desired { + // Nothing changed, so no need to update the target revision + return + } var
flags vmv1.Flag - if desired.HasFieldGreaterThan(current) { + if desired.HasFieldGreaterThan(*s.LastDesiredResources) { flags.Set(revsource.Upscale) } - if desired.HasFieldLessThan(current) { + if desired.HasFieldLessThan(*s.LastDesiredResources) { flags.Set(revsource.Downscale) } if immediate { @@ -892,7 +889,6 @@ func (s *state) updateTargetRevision( } s.TargetRevision = s.Config.RevisionSource.Next(now, flags) - } func (s *state) updateNeonVMCurrentRevision(currentRevision vmv1.RevisionWithTime) { diff --git a/pkg/agent/core/state_test.go b/pkg/agent/core/state_test.go index 5c1a2ea04..235e2da84 100644 --- a/pkg/agent/core/state_test.go +++ b/pkg/agent/core/state_test.go @@ -1010,6 +1010,8 @@ func TestDownscalePivotBack(t *testing.T) { return clock.Inc(clockTickDuration / 2) } var expectedRevision *helpers.ExpectedRevision + latencyObserver := &latencyObserver{t: t, observations: nil} + defer latencyObserver.assertEmpty() resForCU := DefaultComputeUnit.Mul var state *core.State @@ -1052,6 +1054,7 @@ func TestDownscalePivotBack(t *testing.T) { a.Do(state.Monitor().DownscaleRequestAllowed, clock.Now(), expectedRevision.WithTime()) }, post: func(pluginWait *time.Duration) { + expectedRevision.Value = 2 t.Log(" > start vm-monitor upscale") a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: *pluginWait}, @@ -1161,6 +1164,9 @@ func TestDownscalePivotBack(t *testing.T) { helpers.WithStoredWarnings(a.StoredWarnings()), helpers.WithMinMaxCU(1, 3), helpers.WithCurrentCU(2), + helpers.WithConfigSetting(func(c *core.Config) { + c.RevisionSource = revsource.NewRevisionSource(latencyObserver.observe) + }), ) state.Monitor().Active(true) @@ -1175,6 +1181,10 @@ func TestDownscalePivotBack(t *testing.T) { a.Call(getDesiredResources, state, clock.Now()). Equals(resForCU(1)) + // We start with downscale + expectedRevision.Value = 1 + expectedRevision.Flags = revsource.Downscale + for j := 0; j <= i; j++ { midRequest := func() {} if j == i { @@ -1184,6 +1194,7 @@ func TestDownscalePivotBack(t *testing.T) { a.Do(state.UpdateSystemMetrics, newMetrics) a.Call(getDesiredResources, state, clock.Now()). Equals(resForCU(2)) + } } @@ -1191,6 +1202,10 @@ func TestDownscalePivotBack(t *testing.T) { } for j := i; j >= 0; j-- { + // Now it is upscale + expectedRevision.Value = 2 + expectedRevision.Flags = revsource.Upscale + steps[j].post(&pluginWait) } } From 49813c73cfb0c83da06a7206f6f3b1844de00750 Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Thu, 18 Jul 2024 18:18:45 +0400 Subject: [PATCH 42/57] simplify tests Signed-off-by: Oleg Vasilev --- pkg/agent/core/state_test.go | 59 +++++++++++++++++++------ pkg/agent/core/testhelpers/construct.go | 6 +++ 2 files changed, 51 insertions(+), 14 deletions(-) diff --git a/pkg/agent/core/state_test.go b/pkg/agent/core/state_test.go index 235e2da84..84f5734ba 100644 --- a/pkg/agent/core/state_test.go +++ b/pkg/agent/core/state_test.go @@ -371,12 +371,10 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { // Now NeonVM request is done. a.Do(state.NeonVM().RequestSuccessful, clock.Now()) - vmInfo := helpers.CreateVmInfo( + a.Do(state.UpdatedVM, helpers.CreateVmInfo( DefaultInitialStateConfig.VM, - ) - rev := expectedRevision.WithTime() - vmInfo.CurrentRevision = &rev - a.Do(state.UpdatedVM, vmInfo) + helpers.WithCurrentRevision(expectedRevision.WithTime()), + )) // And we see the latency. 
We started at 0.2s and finished at 0.4s latencyObserver.assert(duration("0.2s"), revsource.Upscale) @@ -461,9 +459,10 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { clockTick().AssertEquals(duration("0.9s")) // This means that the NeonVM has applied the changes. - rev = expectedRevision.WithTime() - vmInfo.CurrentRevision = &rev - a.Do(state.UpdatedVM, vmInfo) + a.Do(state.UpdatedVM, helpers.CreateVmInfo( + DefaultInitialStateConfig.VM, + helpers.WithCurrentRevision(expectedRevision.WithTime()), + )) // We started at 0.6s and finished at 0.9s. latencyObserver.assert(duration("0.3s"), revsource.Downscale) @@ -926,13 +925,11 @@ func TestRequestedUpscale(t *testing.T) { a.Do(state.NeonVM().RequestSuccessful, clock.Now()) // Update the VM to set current=1 - vmInfo := helpers.CreateVmInfo( + a.Do(state.UpdatedVM, helpers.CreateVmInfo( DefaultInitialStateConfig.VM, helpers.WithCurrentCU(2), - ) - rev := expectedRevision.WithTime() - vmInfo.CurrentRevision = &rev - a.Do(state.UpdatedVM, vmInfo) + helpers.WithCurrentRevision(expectedRevision.WithTime()), + )) latencyObserver.assert(duration("0.2s"), revsource.Upscale|revsource.Immediate) // Finally, tell the vm-monitor that it got upscaled: @@ -1108,6 +1105,21 @@ func TestDownscalePivotBack(t *testing.T) { a.Do(state.NeonVM().RequestSuccessful, clock.Now()) }, }, + // NeonVM propagation + { + pre: func(_ *time.Duration, midAction func()) { + a.Do(state.UpdatedVM, helpers.CreateVmInfo( + DefaultInitialStateConfig.VM, + helpers.WithCurrentCU(1), + helpers.WithCurrentRevision(expectedRevision.WithTime()), + )) + latencyObserver.assert(duration("0.2s"), revsource.Downscale) + midAction() + }, + post: func(_ *time.Duration) { + // No action + }, + }, // plugin requests { pre: func(pluginWait *time.Duration, midRequest func()) { @@ -1220,6 +1232,8 @@ func TestBoundsChangeRequiresDownsale(t *testing.T) { clock.Inc(100 * time.Millisecond) } expectedRevision := helpers.NewExpectedRevision(clock.Now) + latencyObserver := &latencyObserver{t: t, observations: nil} + defer latencyObserver.assertEmpty() resForCU := DefaultComputeUnit.Mul state := helpers.CreateInitialState( @@ -1227,6 +1241,9 @@ func TestBoundsChangeRequiresDownsale(t *testing.T) { helpers.WithStoredWarnings(a.StoredWarnings()), helpers.WithMinMaxCU(1, 3), helpers.WithCurrentCU(2), + helpers.WithConfigSetting(func(config *core.Config) { + config.RevisionSource = revsource.NewRevisionSource(latencyObserver.observe) + }), ) nextActions := func() core.ActionSet { return state.NextActions(clock.Now()) @@ -1263,6 +1280,8 @@ func TestBoundsChangeRequiresDownsale(t *testing.T) { )) // We should be making a vm-monitor downscaling request + expectedRevision.Value += 1 + expectedRevision.Flags = revsource.Downscale // TODO: In the future, we should have a "force-downscale" alternative so the vm-monitor doesn't // get to deny the downscaling. 
a.Call(nextActions).Equals(core.ActionSet{ @@ -1305,9 +1324,21 @@ func TestBoundsChangeRequiresDownsale(t *testing.T) { Permit: resForCU(1), Migrate: nil, }) + + // Update the VM to set currentCU==1 CU + clockTick() + a.Do(state.UpdatedVM, helpers.CreateVmInfo( + DefaultInitialStateConfig.VM, + helpers.WithCurrentCU(1), + helpers.WithMinMaxCU(1, 1), + helpers.WithCurrentRevision(expectedRevision.WithTime()), + )) + + latencyObserver.assert(duration("0.4s"), revsource.Downscale) + // And then, we shouldn't need to do anything else: a.Call(nextActions).Equals(core.ActionSet{ - Wait: &core.ActionWait{Duration: duration("4.9s")}, + Wait: &core.ActionWait{Duration: duration("4.8s")}, }) } diff --git a/pkg/agent/core/testhelpers/construct.go b/pkg/agent/core/testhelpers/construct.go index ca5b21703..a97e8a10b 100644 --- a/pkg/agent/core/testhelpers/construct.go +++ b/pkg/agent/core/testhelpers/construct.go @@ -159,3 +159,9 @@ func WithCurrentCU(cu uint16) VmInfoOpt { vm.SetUsing(c.ComputeUnit.Mul(cu)) }) } + +func WithCurrentRevision(rev vmapi.RevisionWithTime) VmInfoOpt { + return vmInfoModifier(func(c InitialVmInfoConfig, vm *api.VmInfo) { + vm.CurrentRevision = &rev + }) +} From 7f398069bc5e296942abb4d8a3a886bed0d5a9c5 Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Fri, 19 Jul 2024 11:57:34 +0400 Subject: [PATCH 43/57] fix tiny thing Signed-off-by: Oleg Vasilev --- pkg/agent/core/state_test.go | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pkg/agent/core/state_test.go b/pkg/agent/core/state_test.go index 84f5734ba..aa8c23e1c 100644 --- a/pkg/agent/core/state_test.go +++ b/pkg/agent/core/state_test.go @@ -321,11 +321,9 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { Equals(resForCU(2)) // Now that the initial scheduler request is done, and we have metrics that indicate - // scale-up would be a good idea, we should be contacting the scheduler to get approval. - - // Revision advances. - expectedRevision.Revision.Value = 1 - expectedRevision.Revision.Flags = revsource.Upscale + // scale-up would be a good idea. + expectedRevision.Value = 1 + expectedRevision.Flags = revsource.Upscale // We should be contacting the scheduler to get approval. a.Call(nextActions).Equals(core.ActionSet{ From be700fff26d51998a286eeab8632d357e11d9574 Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Fri, 19 Jul 2024 11:59:58 +0400 Subject: [PATCH 44/57] replace boolean values in metrics with direction label Signed-off-by: Oleg Vasilev --- pkg/agent/core/revsource/revsource.go | 27 ------------------- pkg/agent/core/state.go | 14 +++------- pkg/agent/core/state_test.go | 4 +-- pkg/agent/prommetrics.go | 38 ++++++++++++++++++++++----- pkg/agent/runner.go | 8 +++--- 5 files changed, 41 insertions(+), 50 deletions(-) diff --git a/pkg/agent/core/revsource/revsource.go b/pkg/agent/core/revsource/revsource.go index f6d1be722..cbe1d3ab1 100644 --- a/pkg/agent/core/revsource/revsource.go +++ b/pkg/agent/core/revsource/revsource.go @@ -4,34 +4,14 @@ import ( "errors" "time" - "github.com/prometheus/client_golang/prometheus" - vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1" ) const ( Upscale vmv1.Flag = 1 << iota Downscale - Immediate ) -// AllFlags and AllFlagNames must have the same order, so the metrics work correctly. -var AllFlags = []vmv1.Flag{Upscale, Downscale, Immediate} -var AllFlagNames = []string{"upscale", "downscale", "immediate"} - -// FlagsToLabels converts a set of flags to a list of strings which prometheus can take. 
-func FlagsToLabels(flags vmv1.Flag) []string { - var ret []string - for _, flag := range AllFlags { - value := "false" - if flags.Has(flag) { - value = "true" - } - ret = append(ret, value) - } - return ret -} - // MaxRevisions is the maximum number of revisions that can be stored in the RevisionSource. // This is to prevent memory leaks. // Upon reaching it, the oldest revisions are discarded. @@ -103,13 +83,6 @@ func (c *RevisionSource) Observe(moment time.Time, rev vmv1.Revision) error { type ObserveCallback func(dur time.Duration, flags vmv1.Flag) -func WrapHistogramVec(hist *prometheus.HistogramVec) ObserveCallback { - return func(dur time.Duration, flags vmv1.Flag) { - labels := FlagsToLabels(flags) - hist.WithLabelValues(labels...).Observe(dur.Seconds()) - } -} - // Propagate sets the target revision to be current, optionally measuring the time it took // for propagation. func Propagate( diff --git a/pkg/agent/core/state.go b/pkg/agent/core/state.go index 5f5ed833a..f794ae88f 100644 --- a/pkg/agent/core/state.go +++ b/pkg/agent/core/state.go @@ -849,8 +849,10 @@ func (s *state) desiredResourcesFromMetricsOrRequestedUpscaling(now time.Time) ( return nil } } - s.updateTargetRevision(now, result, s.VM.Using(), requestedUpscalingAffectedResult) + s.updateTargetRevision(now, result, s.VM.Using()) + // TODO: we are both saving the result into LastDesiredResources and returning it. This is + // redundant, and we should remove one of the two. s.LastDesiredResources = &result s.info("Calculated desired resources", @@ -861,12 +863,7 @@ func (s *state) desiredResourcesFromMetricsOrRequestedUpscaling(now time.Time) ( return result, calculateWaitTime } -func (s *state) updateTargetRevision( - now time.Time, - desired api.Resources, - current api.Resources, - immediate bool, -) { +func (s *state) updateTargetRevision(now time.Time, desired api.Resources, current api.Resources) { if s.LastDesiredResources == nil { s.LastDesiredResources = ¤t } @@ -884,9 +881,6 @@ func (s *state) updateTargetRevision( if desired.HasFieldLessThan(*s.LastDesiredResources) { flags.Set(revsource.Downscale) } - if immediate { - flags.Set(revsource.Immediate) - } s.TargetRevision = s.Config.RevisionSource.Next(now, flags) } diff --git a/pkg/agent/core/state_test.go b/pkg/agent/core/state_test.go index aa8c23e1c..f72912eb7 100644 --- a/pkg/agent/core/state_test.go +++ b/pkg/agent/core/state_test.go @@ -887,7 +887,7 @@ func TestRequestedUpscale(t *testing.T) { a.Do(state.Monitor().UpscaleRequested, clock.Now(), api.MoreResources{Cpu: false, Memory: true}) // Revision advances expectedRevision.Value = 1 - expectedRevision.Flags = revsource.Immediate | revsource.Upscale + expectedRevision.Flags = revsource.Upscale // First need to check with the scheduler plugin to get approval for upscaling: a.Call(nextActions).Equals(core.ActionSet{ @@ -928,7 +928,7 @@ func TestRequestedUpscale(t *testing.T) { helpers.WithCurrentCU(2), helpers.WithCurrentRevision(expectedRevision.WithTime()), )) - latencyObserver.assert(duration("0.2s"), revsource.Upscale|revsource.Immediate) + latencyObserver.assert(duration("0.2s"), revsource.Upscale) // Finally, tell the vm-monitor that it got upscaled: a.Call(nextActions).Equals(core.ActionSet{ diff --git a/pkg/agent/prommetrics.go b/pkg/agent/prommetrics.go index 8a46680ab..95ae95435 100644 --- a/pkg/agent/prommetrics.go +++ b/pkg/agent/prommetrics.go @@ -1,9 +1,12 @@ package agent import ( + "time" + "github.com/prometheus/client_golang/prometheus" 
"github.com/prometheus/client_golang/prometheus/collectors" + vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1" "github.com/neondatabase/autoscaling/pkg/agent/core/revsource" "github.com/neondatabase/autoscaling/pkg/util" ) @@ -52,9 +55,11 @@ type resourceChangePair struct { } const ( - directionLabel = "direction" - directionValueInc = "inc" - directionValueDec = "dec" + directionLabel = "direction" + directionValueInc = "inc" + directionValueDec = "dec" + directionValueBoth = "both" + directionValueNone = "none" ) type runnerMetricState string @@ -247,7 +252,7 @@ func makeGlobalMetrics() (GlobalMetrics, *prometheus.Registry) { Help: "End-to-end scaling latency", Buckets: buckets, }, - revsource.AllFlagNames, + []string{directionLabel}, )), pluginLatency: *util.RegisterMetric(reg, prometheus.NewHistogramVec( prometheus.HistogramOpts{ @@ -255,7 +260,7 @@ func makeGlobalMetrics() (GlobalMetrics, *prometheus.Registry) { Help: "Plugin request latency", Buckets: buckets, }, - revsource.AllFlagNames, + []string{directionLabel}, )), monitorLatency: *util.RegisterMetric(reg, prometheus.NewHistogramVec( prometheus.HistogramOpts{ @@ -263,7 +268,7 @@ func makeGlobalMetrics() (GlobalMetrics, *prometheus.Registry) { Help: "Monitor request latency", Buckets: buckets, }, - revsource.AllFlagNames, + []string{directionLabel}, )), neonvmLatency: *util.RegisterMetric(reg, prometheus.NewHistogramVec( prometheus.HistogramOpts{ @@ -271,7 +276,7 @@ func makeGlobalMetrics() (GlobalMetrics, *prometheus.Registry) { Help: "NeonVM request latency", Buckets: buckets, }, - revsource.AllFlagNames, + []string{directionLabel}, )), } @@ -309,6 +314,25 @@ func makeGlobalMetrics() (GlobalMetrics, *prometheus.Registry) { return metrics, reg } +func flagsToDirection(flags vmv1.Flag) string { + if flags.Has(revsource.Upscale) && flags.Has(revsource.Downscale) { + return directionValueBoth + } + if flags.Has(revsource.Upscale) { + return directionValueInc + } + if flags.Has(revsource.Downscale) { + return directionValueDec + } + return directionValueNone +} + +func WrapHistogramVec(hist *prometheus.HistogramVec) revsource.ObserveCallback { + return func(dur time.Duration, flags vmv1.Flag) { + hist.WithLabelValues(flagsToDirection(flags)).Observe(dur.Seconds()) + } +} + type PerVMMetrics struct { cpu *prometheus.GaugeVec memory *prometheus.GaugeVec diff --git a/pkg/agent/runner.go b/pkg/agent/runner.go index 9b4fb9325..73af4d819 100644 --- a/pkg/agent/runner.go +++ b/pkg/agent/runner.go @@ -197,7 +197,7 @@ func (r *Runner) Run(ctx context.Context, logger *zap.Logger, vmInfoUpdated util coreExecLogger := execLogger.Named("core") revisionSource := revsource.NewRevisionSource( - revsource.WrapHistogramVec(&r.global.metrics.scalingLatency), + WrapHistogramVec(&r.global.metrics.scalingLatency), ) executorCore := executor.NewExecutorCore(coreExecLogger, getVmInfo(), executor.Config{ OnNextActions: r.global.metrics.runnerNextActions.Inc, @@ -217,9 +217,9 @@ func (r *Runner) Run(ctx context.Context, logger *zap.Logger, vmInfoUpdated util }, RevisionSource: revisionSource, PromMetricsCallbacks: core.ObservabilityCallbacks{ - PluginLatency: revsource.WrapHistogramVec(&r.global.metrics.pluginLatency), - MonitorLatency: revsource.WrapHistogramVec(&r.global.metrics.monitorLatency), - NeonVMLatency: revsource.WrapHistogramVec(&r.global.metrics.neonvmLatency), + PluginLatency: WrapHistogramVec(&r.global.metrics.pluginLatency), + MonitorLatency: WrapHistogramVec(&r.global.metrics.monitorLatency), + NeonVMLatency: 
WrapHistogramVec(&r.global.metrics.neonvmLatency), }, }, }) From 9b07933ae5f90da6f8d25d600590f993b6a0aa92 Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Fri, 19 Jul 2024 18:01:35 +0400 Subject: [PATCH 45/57] misc renames Signed-off-by: Oleg Vasilev --- neonvm/controllers/vm_controller.go | 2 ++ pkg/agent/core/revsource/revsource.go | 6 +++--- pkg/agent/core/state.go | 15 +++++++++------ pkg/agent/core/state_test.go | 6 +++--- pkg/agent/runner.go | 2 +- 5 files changed, 18 insertions(+), 13 deletions(-) diff --git a/neonvm/controllers/vm_controller.go b/neonvm/controllers/vm_controller.go index 5491e48a2..e5ce1fc80 100644 --- a/neonvm/controllers/vm_controller.go +++ b/neonvm/controllers/vm_controller.go @@ -800,6 +800,8 @@ func (r *VMReconciler) doReconcile(ctx context.Context, vm *vmv1.VirtualMachine) // do nothing } + // Propagate TargetRevision to CurrentRevision. This is done only if the VM is fully + // reconciled and running. if vm.Status.Phase == vmv1.VmRunning && vm.Spec.TargetRevision != nil { rev := vm.Spec.TargetRevision.WithTime(time.Now()) vm.Status.CurrentRevision = &rev diff --git a/pkg/agent/core/revsource/revsource.go b/pkg/agent/core/revsource/revsource.go index cbe1d3ab1..dc4a97064 100644 --- a/pkg/agent/core/revsource/revsource.go +++ b/pkg/agent/core/revsource/revsource.go @@ -89,11 +89,11 @@ func Propagate( now time.Time, target vmv1.RevisionWithTime, currentSlot *vmv1.Revision, - metricCB ObserveCallback, + cb ObserveCallback, ) { - if metricCB != nil { + if cb != nil { diff := now.Sub(target.UpdatedAt.Time) - metricCB(diff, target.Flags) + cb(diff, target.Flags) } if currentSlot == nil { return diff --git a/pkg/agent/core/state.go b/pkg/agent/core/state.go index f794ae88f..23a7f4b03 100644 --- a/pkg/agent/core/state.go +++ b/pkg/agent/core/state.go @@ -89,8 +89,8 @@ type Config struct { // RevisionSource is the source of revisions to track the progress during scaling. RevisionSource RevisionSource `json:"-"` - // PromMetricsCallbacks are the callbacks to update the Prometheus metrics. - PromMetricsCallbacks ObservabilityCallbacks `json:"-"` + // ObservabilityCallbacks are the callbacks to submit datapoints for observability. + ObservabilityCallbacks ObservabilityCallbacks `json:"-"` } type LogConfig struct { @@ -222,6 +222,9 @@ type neonvmState struct { OngoingRequested *api.Resources RequestFailedAt *time.Time + // TargetRevision is the revision agent works towards. Contrary to monitor/plugin, we + // store it not only in action, but also here. This is needed, because for NeonVM propagation + // happens after the changes are actually applied, when the action object is long gone. 
TargetRevision vmv1.RevisionWithTime CurrentRevision vmv1.Revision } @@ -889,7 +892,7 @@ func (s *state) updateNeonVMCurrentRevision(currentRevision vmv1.RevisionWithTim revsource.Propagate(currentRevision.UpdatedAt.Time, s.NeonVM.TargetRevision, &s.NeonVM.CurrentRevision, - s.Config.PromMetricsCallbacks.NeonVMLatency, + s.Config.ObservabilityCallbacks.NeonVMLatency, ) err := s.Config.RevisionSource.Observe(currentRevision.UpdatedAt.Time, currentRevision.Revision) if err != nil { @@ -1106,7 +1109,7 @@ func (h PluginHandle) RequestSuccessful( revsource.Propagate(now, targetRevision, &h.s.Plugin.CurrentRevision, - h.s.Config.PromMetricsCallbacks.PluginLatency, + h.s.Config.ObservabilityCallbacks.PluginLatency, ) return nil } @@ -1181,7 +1184,7 @@ func (h MonitorHandle) DownscaleRequestAllowed(now time.Time, rev vmv1.RevisionW revsource.Propagate(now, rev, &h.s.Monitor.CurrentRevision, - h.s.Config.PromMetricsCallbacks.MonitorLatency, + h.s.Config.ObservabilityCallbacks.MonitorLatency, ) } @@ -1196,7 +1199,7 @@ func (h MonitorHandle) DownscaleRequestDenied(now time.Time, targetRevision vmv1 revsource.Propagate(now, targetRevision, &h.s.Monitor.CurrentRevision, - h.s.Config.PromMetricsCallbacks.MonitorLatency, + h.s.Config.ObservabilityCallbacks.MonitorLatency, ) } diff --git a/pkg/agent/core/state_test.go b/pkg/agent/core/state_test.go index f72912eb7..79b6dd5e0 100644 --- a/pkg/agent/core/state_test.go +++ b/pkg/agent/core/state_test.go @@ -132,7 +132,7 @@ func Test_DesiredResourcesFromMetricsOrRequestedUpscaling(t *testing.T) { }, }, RevisionSource: revsource.NewRevisionSource(nil), - PromMetricsCallbacks: core.ObservabilityCallbacks{ + ObservabilityCallbacks: core.ObservabilityCallbacks{ PluginLatency: nil, MonitorLatency: nil, NeonVMLatency: nil, @@ -206,7 +206,7 @@ var DefaultInitialStateConfig = helpers.InitialStateConfig{ Warn: nil, }, RevisionSource: &helpers.NilRevisionSource{}, - PromMetricsCallbacks: core.ObservabilityCallbacks{ + ObservabilityCallbacks: core.ObservabilityCallbacks{ PluginLatency: nil, MonitorLatency: nil, NeonVMLatency: nil, @@ -503,7 +503,7 @@ func TestPeriodicPluginRequest(t *testing.T) { DefaultInitialStateConfig, helpers.WithStoredWarnings(a.StoredWarnings()), helpers.WithConfigSetting(func(c *core.Config) { - c.PromMetricsCallbacks.PluginLatency = latencyObserver.observe + c.ObservabilityCallbacks.PluginLatency = latencyObserver.observe // This time, we will test plugin latency c.RevisionSource = revsource.NewRevisionSource(nil) }), diff --git a/pkg/agent/runner.go b/pkg/agent/runner.go index 73af4d819..24d0a7317 100644 --- a/pkg/agent/runner.go +++ b/pkg/agent/runner.go @@ -216,7 +216,7 @@ func (r *Runner) Run(ctx context.Context, logger *zap.Logger, vmInfoUpdated util Warn: coreExecLogger.Warn, }, RevisionSource: revisionSource, - PromMetricsCallbacks: core.ObservabilityCallbacks{ + ObservabilityCallbacks: core.ObservabilityCallbacks{ PluginLatency: WrapHistogramVec(&r.global.metrics.pluginLatency), MonitorLatency: WrapHistogramVec(&r.global.metrics.monitorLatency), NeonVMLatency: WrapHistogramVec(&r.global.metrics.neonvmLatency), From 84fa06278479effd5eef33f816ed2413c3ff25e6 Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Sun, 21 Jul 2024 16:07:18 +0400 Subject: [PATCH 46/57] rollback extra diff Signed-off-by: Oleg Vasilev --- pkg/agent/core/state.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pkg/agent/core/state.go b/pkg/agent/core/state.go index 23a7f4b03..286629e95 100644 --- a/pkg/agent/core/state.go +++ 
b/pkg/agent/core/state.go @@ -234,7 +234,7 @@ func (ns *neonvmState) ongoingRequest() bool { } func NewState(vm api.VmInfo, config Config) *State { - state := &State{ + return &State{ internal: state{ Config: config, Debug: false, @@ -267,8 +267,6 @@ func NewState(vm api.VmInfo, config Config) *State { TargetRevision: vmv1.ZeroRevision, }, } - - return state } func (s *state) info(msg string, fields ...zap.Field) { From 0df27ac9974d4e06dd32eb186053e1e062c3d00f Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Sun, 21 Jul 2024 16:22:48 +0400 Subject: [PATCH 47/57] add initial revision Signed-off-by: Oleg Vasilev --- pkg/agent/core/revsource/revsource.go | 4 ++-- pkg/agent/core/revsource/revsource_test.go | 8 +++++++- pkg/agent/core/state_test.go | 12 ++++++------ pkg/agent/runner.go | 12 ++++++++---- 4 files changed, 23 insertions(+), 13 deletions(-) diff --git a/pkg/agent/core/revsource/revsource.go b/pkg/agent/core/revsource/revsource.go index dc4a97064..306927fac 100644 --- a/pkg/agent/core/revsource/revsource.go +++ b/pkg/agent/core/revsource/revsource.go @@ -30,11 +30,11 @@ type RevisionSource struct { offset int64 } -func NewRevisionSource(cb ObserveCallback) *RevisionSource { +func NewRevisionSource(initialRevision int64, cb ObserveCallback) *RevisionSource { return &RevisionSource{ cb: cb, measurements: nil, - offset: 1, // Start with 1, 0 is reserved for default value. + offset: initialRevision + 1, // Will start from the next one } } diff --git a/pkg/agent/core/revsource/revsource_test.go b/pkg/agent/core/revsource/revsource_test.go index 2a663c61f..87fe801c9 100644 --- a/pkg/agent/core/revsource/revsource_test.go +++ b/pkg/agent/core/revsource/revsource_test.go @@ -46,7 +46,7 @@ func newTestRevisionSource(t *testing.T) *testRevisionSource { tcm.result = &d tcm.resultFlags = &flags } - tcm.RevisionSource = revsource.NewRevisionSource(cb) + tcm.RevisionSource = revsource.NewRevisionSource(0, cb) return tcm } @@ -110,3 +110,9 @@ func TestStale(t *testing.T) { assert.NoError(t, err) assert.Nil(t, trs.result) } + +func TestNonZeroRev(t *testing.T) { + revSource := revsource.NewRevisionSource(5, nil) + rev := revSource.Next(time.Now(), 0) + assert.Equal(t, int64(6), rev.Value) +} diff --git a/pkg/agent/core/state_test.go b/pkg/agent/core/state_test.go index 79b6dd5e0..8c6d3d013 100644 --- a/pkg/agent/core/state_test.go +++ b/pkg/agent/core/state_test.go @@ -131,7 +131,7 @@ func Test_DesiredResourcesFromMetricsOrRequestedUpscaling(t *testing.T) { warnings = append(warnings, msg) }, }, - RevisionSource: revsource.NewRevisionSource(nil), + RevisionSource: revsource.NewRevisionSource(0, nil), ObservabilityCallbacks: core.ObservabilityCallbacks{ PluginLatency: nil, MonitorLatency: nil, @@ -297,7 +297,7 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { helpers.WithStoredWarnings(a.StoredWarnings()), helpers.WithTestingLogfWarnings(t), helpers.WithConfigSetting(func(c *core.Config) { - c.RevisionSource = revsource.NewRevisionSource(latencyObserver.observe) + c.RevisionSource = revsource.NewRevisionSource(0, latencyObserver.observe) }), ) nextActions := func() core.ActionSet { @@ -505,7 +505,7 @@ func TestPeriodicPluginRequest(t *testing.T) { helpers.WithConfigSetting(func(c *core.Config) { c.ObservabilityCallbacks.PluginLatency = latencyObserver.observe // This time, we will test plugin latency - c.RevisionSource = revsource.NewRevisionSource(nil) + c.RevisionSource = revsource.NewRevisionSource(0, nil) }), ) @@ -857,7 +857,7 @@ func TestRequestedUpscale(t *testing.T) { 
DefaultInitialStateConfig, helpers.WithStoredWarnings(a.StoredWarnings()), helpers.WithConfigSetting(func(c *core.Config) { - c.RevisionSource = revsource.NewRevisionSource(latencyObserver.observe) + c.RevisionSource = revsource.NewRevisionSource(0, latencyObserver.observe) c.MonitorRequestedUpscaleValidPeriod = duration("6s") // Override this for consistency }), ) @@ -1175,7 +1175,7 @@ func TestDownscalePivotBack(t *testing.T) { helpers.WithMinMaxCU(1, 3), helpers.WithCurrentCU(2), helpers.WithConfigSetting(func(c *core.Config) { - c.RevisionSource = revsource.NewRevisionSource(latencyObserver.observe) + c.RevisionSource = revsource.NewRevisionSource(0, latencyObserver.observe) }), ) @@ -1240,7 +1240,7 @@ func TestBoundsChangeRequiresDownsale(t *testing.T) { helpers.WithMinMaxCU(1, 3), helpers.WithCurrentCU(2), helpers.WithConfigSetting(func(config *core.Config) { - config.RevisionSource = revsource.NewRevisionSource(latencyObserver.observe) + config.RevisionSource = revsource.NewRevisionSource(0, latencyObserver.observe) }), ) nextActions := func() core.ActionSet { diff --git a/pkg/agent/runner.go b/pkg/agent/runner.go index 24d0a7317..203cfd526 100644 --- a/pkg/agent/runner.go +++ b/pkg/agent/runner.go @@ -196,10 +196,14 @@ func (r *Runner) Run(ctx context.Context, logger *zap.Logger, vmInfoUpdated util pluginRequestJitter := util.NewTimeRange(time.Millisecond, 0, 100).Random() coreExecLogger := execLogger.Named("core") - revisionSource := revsource.NewRevisionSource( - WrapHistogramVec(&r.global.metrics.scalingLatency), - ) - executorCore := executor.NewExecutorCore(coreExecLogger, getVmInfo(), executor.Config{ + + vmInfo := getVmInfo() + var initialRevision int64 + if vmInfo.CurrentRevision != nil { + initialRevision = vmInfo.CurrentRevision.Value + } + revisionSource := revsource.NewRevisionSource(initialRevision, WrapHistogramVec(&r.global.metrics.scalingLatency)) + executorCore := executor.NewExecutorCore(coreExecLogger, vmInfo, executor.Config{ OnNextActions: r.global.metrics.runnerNextActions.Inc, Core: core.Config{ ComputeUnit: r.global.config.Scaling.ComputeUnit, From d9dee983f2fabd6d621c12392b51e51f14f54a8e Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Sun, 21 Jul 2024 16:23:39 +0400 Subject: [PATCH 48/57] fix revision updating when it is the same Signed-off-by: Oleg Vasilev --- neonvm/controllers/vm_controller.go | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/neonvm/controllers/vm_controller.go b/neonvm/controllers/vm_controller.go index e5ce1fc80..a14946316 100644 --- a/neonvm/controllers/vm_controller.go +++ b/neonvm/controllers/vm_controller.go @@ -802,14 +802,25 @@ func (r *VMReconciler) doReconcile(ctx context.Context, vm *vmv1.VirtualMachine) // Propagate TargetRevision to CurrentRevision. This is done only if the VM is fully // reconciled and running. 
- if vm.Status.Phase == vmv1.VmRunning && vm.Spec.TargetRevision != nil { - rev := vm.Spec.TargetRevision.WithTime(time.Now()) - vm.Status.CurrentRevision = &rev + if vm.Status.Phase == vmv1.VmRunning { + propagateRevision(vm) } return nil } +func propagateRevision(vm *vmv1.VirtualMachine) { + if vm.Spec.TargetRevision == nil { + return + } + if vm.Status.CurrentRevision != nil && + vm.Status.CurrentRevision.Revision == vm.Spec.TargetRevision.Revision { + return + } + rev := vm.Spec.TargetRevision.WithTime(time.Now()) + vm.Status.CurrentRevision = &rev +} + func pickMemoryProvider(config *ReconcilerConfig, vm *vmv1.VirtualMachine) vmv1.MemoryProvider { if p := vm.Spec.Guest.MemoryProvider; p != nil { return *p From 2099577569948588bb14ef2097bf409feb919b44 Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Sun, 21 Jul 2024 16:29:25 +0400 Subject: [PATCH 49/57] don't propagate if we are already current Signed-off-by: Oleg Vasilev --- pkg/agent/core/revsource/revsource.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pkg/agent/core/revsource/revsource.go b/pkg/agent/core/revsource/revsource.go index 306927fac..d28e5d0b8 100644 --- a/pkg/agent/core/revsource/revsource.go +++ b/pkg/agent/core/revsource/revsource.go @@ -91,15 +91,15 @@ func Propagate( currentSlot *vmv1.Revision, cb ObserveCallback, ) { - if cb != nil { - diff := now.Sub(target.UpdatedAt.Time) - cb(diff, target.Flags) - } if currentSlot == nil { return } - if currentSlot.Value > target.Value { + if currentSlot.Value >= target.Value { return } + if cb != nil { + diff := now.Sub(target.UpdatedAt.Time) + cb(diff, target.Flags) + } *currentSlot = target.Revision } From 3e3c765f5fe252250d70e0bd47fbbd805dc05233 Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Mon, 22 Jul 2024 01:13:10 +0400 Subject: [PATCH 50/57] fix the test Signed-off-by: Oleg Vasilev --- pkg/agent/core/state_test.go | 6 ++-- .../e2e/autoscaling.revisions/01-assert.yaml | 25 ++------------- .../01-change-target-revision.yaml | 17 ++++++++++ .../e2e/autoscaling.revisions/02-assert.yaml | 32 +++++++++++++++++++ .../{01-upscale.yaml => 02-upscale.yaml} | 0 5 files changed, 54 insertions(+), 26 deletions(-) create mode 100644 tests/e2e/autoscaling.revisions/01-change-target-revision.yaml create mode 100644 tests/e2e/autoscaling.revisions/02-assert.yaml rename tests/e2e/autoscaling.revisions/{01-upscale.yaml => 02-upscale.yaml} (100%) diff --git a/pkg/agent/core/state_test.go b/pkg/agent/core/state_test.go index 8c6d3d013..01b1ba684 100644 --- a/pkg/agent/core/state_test.go +++ b/pkg/agent/core/state_test.go @@ -503,8 +503,8 @@ func TestPeriodicPluginRequest(t *testing.T) { DefaultInitialStateConfig, helpers.WithStoredWarnings(a.StoredWarnings()), helpers.WithConfigSetting(func(c *core.Config) { - c.ObservabilityCallbacks.PluginLatency = latencyObserver.observe // This time, we will test plugin latency + c.ObservabilityCallbacks.PluginLatency = latencyObserver.observe c.RevisionSource = revsource.NewRevisionSource(0, nil) }), ) @@ -528,7 +528,6 @@ func TestPeriodicPluginRequest(t *testing.T) { endTime := duration("20s") doInitialPluginRequest(a, state, clock, clockTick, lo.ToPtr(metrics.ToAPI()), resources) - latencyObserver.assert(duration("100ms"), 0) for clock.Elapsed().Duration < endTime { timeSinceScheduledRequest := (clock.Elapsed().Duration - base) % reqEvery @@ -558,7 +557,6 @@ func TestPeriodicPluginRequest(t *testing.T) { Migrate: nil, }) clock.Inc(clockTick - reqDuration) - latencyObserver.assert(reqDuration, 0) } } } @@ -573,6 
+571,8 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { clock.Inc(clockTickDuration) } expectedRevision := helpers.NewExpectedRevision(clock.Now) + latencyObserver := &latencyObserver{t: t, observations: nil} + defer latencyObserver.assertEmpty() resForCU := DefaultComputeUnit.Mul state := helpers.CreateInitialState( diff --git a/tests/e2e/autoscaling.revisions/01-assert.yaml b/tests/e2e/autoscaling.revisions/01-assert.yaml index b3e818c36..a65939fea 100644 --- a/tests/e2e/autoscaling.revisions/01-assert.yaml +++ b/tests/e2e/autoscaling.revisions/01-assert.yaml @@ -6,29 +6,8 @@ apiVersion: vm.neon.tech/v1 kind: VirtualMachine metadata: name: example -spec: - targetRevision: - # Note that revision goes backward, compared with the previous step. - # This is intentional, this will happen each time autoscaler-agent restarts. - revision: - value: 1 - flags: 1 # 1 for Upscale status: - phase: Running - restartCount: 0 - conditions: - - type: Available - status: "True" - cpus: 1 - memorySize: 4Gi - currentRevision: # Already propagated from above + currentRevision: revision: value: 1 - flags: 1 ---- -apiVersion: v1 -kind: pod -metadata: - name: workload -status: - phase: Running + flags: 123 diff --git a/tests/e2e/autoscaling.revisions/01-change-target-revision.yaml b/tests/e2e/autoscaling.revisions/01-change-target-revision.yaml new file mode 100644 index 000000000..716d7b309 --- /dev/null +++ b/tests/e2e/autoscaling.revisions/01-change-target-revision.yaml @@ -0,0 +1,17 @@ +apiVersion: kuttl.dev/v1beta1 +kind: TestStep +unitTest: false +--- + +apiVersion: vm.neon.tech/v1 +kind: VirtualMachine +metadata: + name: example +spec: + targetRevision: + # Note that revision goes backward, compared with the previous step. + # This is intentional, in case it races with autoscaler-agent restarts. 
+ revision: + value: 1 + flags: 123 + updatedAt: 2020-01-02T15:04:05Z diff --git a/tests/e2e/autoscaling.revisions/02-assert.yaml b/tests/e2e/autoscaling.revisions/02-assert.yaml new file mode 100644 index 000000000..05d504fc9 --- /dev/null +++ b/tests/e2e/autoscaling.revisions/02-assert.yaml @@ -0,0 +1,32 @@ +apiVersion: kuttl.dev/v1beta1 +kind: TestAssert +timeout: 90 +--- +apiVersion: vm.neon.tech/v1 +kind: VirtualMachine +metadata: + name: example +spec: + targetRevision: + revision: + value: 1 + flags: 1 # 1 for Upscale +status: + phase: Running + restartCount: 0 + conditions: + - type: Available + status: "True" + cpus: 1 + memorySize: 4Gi + currentRevision: # Already propagated from above + revision: + value: 1 + flags: 1 +--- +apiVersion: v1 +kind: pod +metadata: + name: workload +status: + phase: Running diff --git a/tests/e2e/autoscaling.revisions/01-upscale.yaml b/tests/e2e/autoscaling.revisions/02-upscale.yaml similarity index 100% rename from tests/e2e/autoscaling.revisions/01-upscale.yaml rename to tests/e2e/autoscaling.revisions/02-upscale.yaml From 7e5ad35399afae43a9a0ab790499171a274dc58b Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Mon, 22 Jul 2024 01:15:30 +0400 Subject: [PATCH 51/57] fix test Signed-off-by: Oleg Vasilev --- tests/e2e/autoscaling.revisions/02-assert.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/e2e/autoscaling.revisions/02-assert.yaml b/tests/e2e/autoscaling.revisions/02-assert.yaml index 05d504fc9..462857775 100644 --- a/tests/e2e/autoscaling.revisions/02-assert.yaml +++ b/tests/e2e/autoscaling.revisions/02-assert.yaml @@ -9,7 +9,7 @@ metadata: spec: targetRevision: revision: - value: 1 + value: 2 # 1 was last step flags: 1 # 1 for Upscale status: phase: Running From f2694247e1343bb6d3a04c5789a391a872c6c825 Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Mon, 22 Jul 2024 01:16:59 +0400 Subject: [PATCH 52/57] fix test Signed-off-by: Oleg Vasilev --- tests/e2e/autoscaling.revisions/02-assert.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/e2e/autoscaling.revisions/02-assert.yaml b/tests/e2e/autoscaling.revisions/02-assert.yaml index 462857775..50843e53b 100644 --- a/tests/e2e/autoscaling.revisions/02-assert.yaml +++ b/tests/e2e/autoscaling.revisions/02-assert.yaml @@ -9,7 +9,7 @@ metadata: spec: targetRevision: revision: - value: 2 # 1 was last step + value: 124 # we had 123 as the initial revision flags: 1 # 1 for Upscale status: phase: Running From 52283351fa446fb248dbdc2ef7c7cc084d5314f6 Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Mon, 22 Jul 2024 01:19:35 +0400 Subject: [PATCH 53/57] fix test Signed-off-by: Oleg Vasilev --- tests/e2e/autoscaling.revisions/02-assert.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/e2e/autoscaling.revisions/02-assert.yaml b/tests/e2e/autoscaling.revisions/02-assert.yaml index 50843e53b..b8b35bb47 100644 --- a/tests/e2e/autoscaling.revisions/02-assert.yaml +++ b/tests/e2e/autoscaling.revisions/02-assert.yaml @@ -21,7 +21,7 @@ status: memorySize: 4Gi currentRevision: # Already propagated from above revision: - value: 1 + value: 124 flags: 1 --- apiVersion: v1 From 8dcd87ac689e641384fe2cd5af5aece9fda3e3ad Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Mon, 22 Jul 2024 11:21:57 +0400 Subject: [PATCH 54/57] add new test Signed-off-by: Oleg Vasilev --- pkg/agent/core/state_test.go | 132 +++++++++++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) diff --git a/pkg/agent/core/state_test.go 
b/pkg/agent/core/state_test.go index 01b1ba684..47a31d3c5 100644 --- a/pkg/agent/core/state_test.go +++ b/pkg/agent/core/state_test.go @@ -561,6 +561,138 @@ func TestPeriodicPluginRequest(t *testing.T) { } } +// In this test agent wants to upscale from 1 CU to 4 CU, but the plugin only allows 3 CU. +// Agent upscales to 3 CU, then tries to upscale to 4 CU again. +func TestPartialUpscaleThenFull(t *testing.T) { + a := helpers.NewAssert(t) + clock := helpers.NewFakeClock(t) + clockTickDuration := duration("0.1s") + clockTick := func() { + clock.Inc(clockTickDuration) + } + expectedRevision := helpers.NewExpectedRevision(clock.Now) + scalingLatencyObserver := &latencyObserver{t: t, observations: nil} + defer scalingLatencyObserver.assertEmpty() + + pluginLatencyObserver := &latencyObserver{t: t, observations: nil} + defer pluginLatencyObserver.assertEmpty() + + resForCU := DefaultComputeUnit.Mul + + state := helpers.CreateInitialState( + DefaultInitialStateConfig, + helpers.WithStoredWarnings(a.StoredWarnings()), + helpers.WithMinMaxCU(1, 4), + helpers.WithCurrentCU(1), + helpers.WithConfigSetting(func(c *core.Config) { + c.RevisionSource = revsource.NewRevisionSource(0, scalingLatencyObserver.observe) + c.ObservabilityCallbacks.PluginLatency = pluginLatencyObserver.observe + }), + ) + + nextActions := func() core.ActionSet { + return state.NextActions(clock.Now()) + } + + state.Monitor().Active(true) + + doInitialPluginRequest(a, state, clock, duration("0.1s"), nil, resForCU(1)) + + // Set metrics + clockTick() + metrics := core.SystemMetrics{ + LoadAverage1Min: 1.0, + MemoryUsageBytes: 12345678, + } + a.Do(state.UpdateSystemMetrics, metrics) + + // double-check that we agree about the desired resources + a.Call(getDesiredResources, state, clock.Now()). + Equals(resForCU(4)) + + // Upscaling to 4 CU + expectedRevision.Value = 1 + expectedRevision.Flags = revsource.Upscale + targetRevision := expectedRevision.WithTime() + a.Call(nextActions).Equals(core.ActionSet{ + PluginRequest: &core.ActionPluginRequest{ + LastPermit: lo.ToPtr(resForCU(1)), + Target: resForCU(4), + Metrics: lo.ToPtr(metrics.ToAPI()), + TargetRevision: targetRevision, + }, + }) + + a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(4)) + clockTick() + a.Call(nextActions).Equals(core.ActionSet{}) + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), targetRevision, api.PluginResponse{ + Permit: resForCU(3), + Migrate: nil, + }) + + pluginLatencyObserver.assert(duration("0.1s"), revsource.Upscale) + + // NeonVM request + a. + WithWarnings("Wanted to make a request to the scheduler plugin, but but previous request for more resources was denied too recently"). + Call(nextActions). 
+ Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("1.9s")}, + NeonVMRequest: &core.ActionNeonVMRequest{ + Current: resForCU(1), + Target: resForCU(3), + TargetRevision: expectedRevision.WithTime(), + }, + }) + + a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(3)) + clockTick() + a.Do(state.NeonVM().RequestSuccessful, clock.Now()) + clockTick() + a.Do(state.UpdatedVM, helpers.CreateVmInfo( + DefaultInitialStateConfig.VM, + helpers.WithCurrentCU(3), + helpers.WithCurrentRevision(expectedRevision.WithTime()), + )) + scalingLatencyObserver.assert(duration("0.3s"), revsource.Upscale) + + clock.Inc(duration("2s")) + + // Upscaling to 4 CU + targetRevision = expectedRevision.WithTime() + a.Call(nextActions).Equals(core.ActionSet{ + MonitorUpscale: &core.ActionMonitorUpscale{ + Current: resForCU(1), + Target: resForCU(3), + TargetRevision: expectedRevision.WithTime(), + }, + PluginRequest: &core.ActionPluginRequest{ + LastPermit: lo.ToPtr(resForCU(3)), + Target: resForCU(4), + Metrics: lo.ToPtr(metrics.ToAPI()), + TargetRevision: expectedRevision.WithTime(), + }, + }) + + a.Do(state.Monitor().StartingUpscaleRequest, clock.Now(), resForCU(3)) + a.Do(state.Plugin().StartingRequest, clock.Now(), resForCU(4)) + clockTick() + a.Do(state.Monitor().UpscaleRequestSuccessful, clock.Now()) + a.NoError(state.Plugin().RequestSuccessful, clock.Now(), targetRevision, api.PluginResponse{ + Permit: resForCU(4), + Migrate: nil, + }) + a.Call(nextActions).Equals(core.ActionSet{ + Wait: &core.ActionWait{Duration: duration("4.9s")}, + NeonVMRequest: &core.ActionNeonVMRequest{ + Current: resForCU(3), + Target: resForCU(4), + TargetRevision: expectedRevision.WithTime(), + }, + }) +} + // Checks that when downscaling is denied, we both (a) try again with higher resources, or (b) wait // to retry if there aren't higher resources to try with. func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { From 49bd61af4dcb1678bbaafaaca1bbea643868a3b9 Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Mon, 22 Jul 2024 11:27:46 +0400 Subject: [PATCH 55/57] change behaviour to count both upscalings Signed-off-by: Oleg Vasilev --- pkg/agent/core/state.go | 5 +++++ pkg/agent/core/state_test.go | 14 ++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/pkg/agent/core/state.go b/pkg/agent/core/state.go index 286629e95..fa44029f5 100644 --- a/pkg/agent/core/state.go +++ b/pkg/agent/core/state.go @@ -893,6 +893,11 @@ func (s *state) updateNeonVMCurrentRevision(currentRevision vmv1.RevisionWithTim s.Config.ObservabilityCallbacks.NeonVMLatency, ) err := s.Config.RevisionSource.Observe(currentRevision.UpdatedAt.Time, currentRevision.Revision) + + // We also zero out LastDesiredResources, because we are now starting from + // a new current resources. 
+ s.LastDesiredResources = nil + + if err != nil { + s.warnf("Failed to observe clock source: %v", err) + } diff --git a/pkg/agent/core/state_test.go b/pkg/agent/core/state_test.go index 47a31d3c5..8f26a8a06 100644 --- a/pkg/agent/core/state_test.go +++ b/pkg/agent/core/state_test.go @@ -660,6 +660,7 @@ func TestPartialUpscaleThenFull(t *testing.T) { clock.Inc(duration("2s")) // Upscaling to 4 CU + expectedRevision.Value += 1 targetRevision = expectedRevision.WithTime() a.Call(nextActions).Equals(core.ActionSet{ MonitorUpscale: &core.ActionMonitorUpscale{ @@ -683,6 +684,7 @@ func TestPartialUpscaleThenFull(t *testing.T) { Permit: resForCU(4), Migrate: nil, }) + pluginLatencyObserver.assert(duration("0.1s"), revsource.Upscale) a.Call(nextActions).Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: duration("4.9s")}, NeonVMRequest: &core.ActionNeonVMRequest{ @@ -691,6 +693,18 @@ func TestPartialUpscaleThenFull(t *testing.T) { TargetRevision: expectedRevision.WithTime(), }, }) + a.Do(state.NeonVM().StartingRequest, clock.Now(), resForCU(4)) + clockTick() + a.Do(state.NeonVM().RequestSuccessful, clock.Now()) + vmInfo := helpers.CreateVmInfo( + DefaultInitialStateConfig.VM, + helpers.WithCurrentCU(4), + helpers.WithCurrentRevision(expectedRevision.WithTime()), + ) + clockTick() + a.Do(state.UpdatedVM, vmInfo) + + scalingLatencyObserver.assert(duration("0.2s"), revsource.Upscale) } // Checks that when downscaling is denied, we both (a) try again with higher resources, or (b) wait // to retry if there aren't higher resources to try with. func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { From 3f59c87c5a6cf070db12486f03163cf60eefc39a Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Mon, 22 Jul 2024 20:37:22 +0400 Subject: [PATCH 56/57] state: check Observe error before resetting LastDesiredResources Signed-off-by: Oleg Vasilev --- pkg/agent/core/state.go | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pkg/agent/core/state.go b/pkg/agent/core/state.go index fa44029f5..9dbfcb4a6 100644 --- a/pkg/agent/core/state.go +++ b/pkg/agent/core/state.go @@ -893,14 +893,13 @@ func (s *state) updateNeonVMCurrentRevision(currentRevision vmv1.RevisionWithTim s.Config.ObservabilityCallbacks.NeonVMLatency, ) err := s.Config.RevisionSource.Observe(currentRevision.UpdatedAt.Time, currentRevision.Revision) + if err != nil { + s.warnf("Failed to observe clock source: %v", err) + } // We also zero out LastDesiredResources, because we are now starting from // a new current resources. s.LastDesiredResources = nil - - if err != nil { - s.warnf("Failed to observe clock source: %v", err) - } } From 5a2355eef416c39de8252162f8471ca73cb323e4 Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Mon, 22 Jul 2024 21:39:26 +0400 Subject: [PATCH 57/57] state_test: fix duplicated "but" in expected warning Signed-off-by: Oleg Vasilev --- pkg/agent/core/state_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/agent/core/state_test.go b/pkg/agent/core/state_test.go index 8f26a8a06..0b86192c5 100644 --- a/pkg/agent/core/state_test.go +++ b/pkg/agent/core/state_test.go @@ -635,7 +635,7 @@ func TestPartialUpscaleThenFull(t *testing.T) { a. - WithWarnings("Wanted to make a request to the scheduler plugin, but but previous request for more resources was denied too recently"). + WithWarnings("Wanted to make a request to the scheduler plugin, but previous request for more resources was denied too recently"). Call(nextActions). Equals(core.ActionSet{ Wait: &core.ActionWait{Duration: duration("1.9s")},