neonvm: Use memory_hotplug.online_policy=auto-movable for virtio-mem (#981)

In short: Currently we hotplug all memory as ZONE_MOVABLE. This has
known issues because there are theoretical and practical limits on the
ratio between ZONE_MOVABLE and ZONE_NORMAL.
With 'memory_hotplug.online_policy=auto-movable', we can use the config
value 'memory_hotplug.auto_movable_ratio' to set a MOVABLE:NORMAL ratio
to uphold during hotplug, and the kernel will Do The Right Thing™.

This change adds a new flag to neonvm-controller and neonvm-runner to
set the value of this ratio: '-memhp-auto-movable-ratio'.

However, please note: the new behavior is only available for VMs using
virtio-mem, because memory_hotplug.auto_movable_ratio does not count
hotplugged ZONE_NORMAL DIMM slots towards the ratio, so IMO it's not
worth changing the existing behavior. [1] [2] [3]

This is the productionized version of the experimentation from #959.
For more on that, see [4].

[1]: https://docs.kernel.org/admin-guide/mm/memory-hotplug.html#module-parameters
[2]: https://lwn.net/Articles/865618/
[3]: https://neondb.slack.com/archives/C03TN5G758R/p1718225978299459?thread_ts=1718158200.324759
[4]: https://www.notion.so/neondatabase/59c9e9b2619e4ccd8fbb99e9b0cb1a89
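
For a concrete sketch of the effect (derived from the neonvm/runner changes
below; 801 is the value set in the controller deployment, and later cmdline
parts such as extra-network settings are omitted), the guest kernel command
line now starts out roughly as:

  DIMMSlots:  panic=-1 init=/neonvm/bin/init console=ttyS1 loglevel=7 root=/dev/vda rw memhp_default_state=online_movable
  VirtioMem:  panic=-1 init=/neonvm/bin/init console=ttyS1 loglevel=7 root=/dev/vda rw memhp_default_state=online memory_hotplug.online_policy=auto-movable memory_hotplug.auto_movable_ratio=801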
sharnoff committed Jun 22, 2024
1 parent da8abff commit 41169c1
Showing 7 changed files with 42 additions and 3 deletions.
1 change: 1 addition & 0 deletions neonvm/config/controller/deployment.yaml
@@ -60,6 +60,7 @@ spec:
# * cache.no-flush=on - ignores disk flush operations (not needed; our disks are ephemeral)
- "--qemu-disk-cache-settings=cache.writeback=on,cache.direct=on,cache.no-flush=on"
- "--default-memory-provider=DIMMSlots"
- "--memhp-auto-movable-ratio=801" # for virtio-mem, set memory_hotplug.auto_movable_ratio=801
- "--failure-pending-period=1m"
- "--failing-refresh-interval=15s"
env:
8 changes: 8 additions & 0 deletions neonvm/controllers/config.go
@@ -33,6 +33,14 @@ type ReconcilerConfig struct {
// new VMs (or, when old ones restart) if nothing is explicitly set.
DefaultMemoryProvider vmv1.MemoryProvider

// MemhpAutoMovableRatio specifies the value that new neonvm-runners will set as the
// kernel's 'memory_hotplug.auto_movable_ratio', iff the memory provider is virtio-mem.
//
// This value is passed directly to neonvm-runner as the '-memhp-auto-movable-ratio' flag.
// We've confirmed sensible values are from 301 to 801 (i.e. 3.01:1 through 8.01:1).
// The range of sensible values may extend further, but we have not tested that.
MemhpAutoMovableRatio string

// FailurePendingPeriod is the period for the propagation of
// reconciliation failures to the observability instruments
FailurePendingPeriod time.Duration
1 change: 1 addition & 0 deletions neonvm/controllers/functests/vm_controller_test.go
@@ -112,6 +112,7 @@ var _ = Describe("VirtualMachine controller", func() {
MaxConcurrentReconciles: 1,
QEMUDiskCacheSettings: "cache=none",
DefaultMemoryProvider: vmv1.MemoryProviderDIMMSlots,
MemhpAutoMovableRatio: "301",
FailurePendingPeriod: 1 * time.Minute,
FailingRefreshInterval: 1 * time.Minute,
},
10 changes: 8 additions & 2 deletions neonvm/controllers/vm_controller.go
@@ -1461,15 +1461,21 @@ func podSpec(
}},
Command: func() []string {
cmd := []string{"runner"}
// intentionally add this first, so it's easier to see among the very long
// args that follow.
if config.UseContainerMgr {
cmd = append(cmd, "-skip-cgroup-management")
}
cmd = append(
cmd,
"-qemu-disk-cache-settings", config.QEMUDiskCacheSettings,
"-memory-provider", string(memoryProvider),
)
if memoryProvider == vmv1.MemoryProviderVirtioMem {
cmd = append(cmd, "-memhp-auto-movable-ratio", config.MemhpAutoMovableRatio)
}
// put these last, so that the earlier args are easier to see (because these
// can get quite large)
cmd = append(
cmd,
"-vmspec", base64.StdEncoding.EncodeToString(vmSpecJson),
"-vmstatus", base64.StdEncoding.EncodeToString(vmStatusJson),
)
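
For illustration, the runner invocation that podSpec builds for a virtio-mem VM
with the deployment defaults above comes out roughly as follows (omitting the
optional '-skip-cgroup-management' flag; 'VirtioMem' stands in for the string
form of vmv1.MemoryProviderVirtioMem, and the base64 payloads are placeholders):

  runner \
      -qemu-disk-cache-settings cache.writeback=on,cache.direct=on,cache.no-flush=on \
      -memory-provider VirtioMem \
      -memhp-auto-movable-ratio 801 \
      -vmspec <base64-encoded VM spec> \
      -vmstatus <base64-encoded VM status>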
1 change: 1 addition & 0 deletions neonvm/controllers/vm_controller_unit_test.go
@@ -117,6 +117,7 @@ func newTestParams(t *testing.T) *testParams {
MaxConcurrentReconciles: 10,
QEMUDiskCacheSettings: "",
DefaultMemoryProvider: vmv1.MemoryProviderDIMMSlots,
MemhpAutoMovableRatio: "301",
FailurePendingPeriod: time.Minute,
FailingRefreshInterval: time.Minute,
},
3 changes: 3 additions & 0 deletions neonvm/main.go
@@ -99,6 +99,7 @@ func main() {
var enableContainerMgr bool
var qemuDiskCacheSettings string
var defaultMemoryProvider vmv1.MemoryProvider
var memhpAutoMovableRatio string
var failurePendingPeriod time.Duration
var failingRefreshInterval time.Duration
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
@@ -110,6 +111,7 @@ func main() {
flag.BoolVar(&enableContainerMgr, "enable-container-mgr", false, "Enable crictl-based container-mgr alongside each VM")
flag.StringVar(&qemuDiskCacheSettings, "qemu-disk-cache-settings", "cache=none", "Set neonvm-runner's QEMU disk cache settings")
flag.Func("default-memory-provider", "Set default memory provider to use for new VMs", defaultMemoryProvider.FlagFunc)
flag.StringVar(&memhpAutoMovableRatio, "memhp-auto-movable-ratio", "301", "For virtio-mem, set VM kernel's memory_hotplug.auto_movable_ratio")
flag.DurationVar(&failurePendingPeriod, "failure-pending-period", 1*time.Minute,
"the period for the propagation of reconciliation failures to the observability instruments")
flag.DurationVar(&failingRefreshInterval, "failing-refresh-interval", 1*time.Minute,
@@ -175,6 +177,7 @@ func main() {
MaxConcurrentReconciles: concurrencyLimit,
QEMUDiskCacheSettings: qemuDiskCacheSettings,
DefaultMemoryProvider: defaultMemoryProvider,
MemhpAutoMovableRatio: memhpAutoMovableRatio,
FailurePendingPeriod: failurePendingPeriod,
FailingRefreshInterval: failingRefreshInterval,
}
21 changes: 20 additions & 1 deletion neonvm/runner/main.go
@@ -596,6 +596,7 @@ type Config struct {
skipCgroupManagement bool
diskCacheSettings string
memoryProvider vmv1.MemoryProvider
autoMovableRatio string
}

func newConfig(logger *zap.Logger) *Config {
@@ -607,6 +608,7 @@ func newConfig(logger *zap.Logger) *Config {
skipCgroupManagement: false,
diskCacheSettings: "cache=none",
memoryProvider: "", // Require that this is explicitly set. We'll check later.
autoMovableRatio: "", // Require that this is explicitly set IFF memoryProvider is VirtioMem. We'll check later.
}
flag.StringVar(&cfg.vmSpecDump, "vmspec", cfg.vmSpecDump,
"Base64 encoded VirtualMachine json specification")
@@ -622,11 +624,17 @@ func newConfig(logger *zap.Logger) *Config {
flag.StringVar(&cfg.diskCacheSettings, "qemu-disk-cache-settings",
cfg.diskCacheSettings, "Cache settings to add to -drive args for VM disks")
flag.Func("memory-provider", "Set provider for memory hotplug", cfg.memoryProvider.FlagFunc)
flag.StringVar(&cfg.autoMovableRatio, "memhp-auto-movable-ratio",
cfg.autoMovableRatio, "Set value of kernel's memory_hotplug.auto_movable_ratio [virtio-mem only]")

flag.Parse()

if cfg.memoryProvider == "" {
logger.Fatal("missing required flag '-memory-provider'")
}
if cfg.memoryProvider == vmv1.MemoryProviderVirtioMem && cfg.autoMovableRatio == "" {
logger.Fatal("missing required flag '-memhp-auto-movable-ratio'")
}

return cfg
}
@@ -911,12 +919,23 @@ func buildQEMUCmd(
}

const (
baseKernelCmdline = "panic=-1 init=/neonvm/bin/init memhp_default_state=online_movable console=ttyS1 loglevel=7 root=/dev/vda rw"
baseKernelCmdline = "panic=-1 init=/neonvm/bin/init console=ttyS1 loglevel=7 root=/dev/vda rw"
kernelCmdlineDIMMSlots = "memhp_default_state=online_movable"
kernelCmdlineVirtioMemTmpl = "memhp_default_state=online memory_hotplug.online_policy=auto-movable memory_hotplug.auto_movable_ratio=%s"
)

func makeKernelCmdline(cfg *Config, vmSpec *vmv1.VirtualMachineSpec, vmStatus *vmv1.VirtualMachineStatus) string {
cmdlineParts := []string{baseKernelCmdline}

switch cfg.memoryProvider {
case vmv1.MemoryProviderDIMMSlots:
cmdlineParts = append(cmdlineParts, kernelCmdlineDIMMSlots)
case vmv1.MemoryProviderVirtioMem:
cmdlineParts = append(cmdlineParts, fmt.Sprintf(kernelCmdlineVirtioMemTmpl, cfg.autoMovableRatio))
default:
panic(fmt.Errorf("unknown memory provider %s", cfg.memoryProvider))
}

if vmSpec.ExtraNetwork != nil && vmSpec.ExtraNetwork.Enable {
netDetails := fmt.Sprintf("ip=%s:::%s:%s:eth1:off", vmStatus.ExtraNetIP, vmStatus.ExtraNetMask, vmStatus.PodName)
cmdlineParts = append(cmdlineParts, netDetails)
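
A note on how the ratio value is interpreted (per the kernel docs [1], so treat
the numbers as an approximation): memory_hotplug.auto_movable_ratio is a maximum
MOVABLE:KERNEL memory ratio expressed in percent, so 301 allows roughly 3.01 MiB
of ZONE_MOVABLE per MiB of kernel-zone memory, and 801 roughly 8.01 MiB per MiB.
For example, with auto_movable_ratio=801 and 1 GiB of non-hotpluggable boot
memory, roughly up to 8.01 GiB of hotplugged memory can be onlined as
ZONE_MOVABLE before further blocks fall back to the kernel zones.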
