neonvm: Use memory_hotplug.online_policy=auto-movable for virtio-mem (#981)

In short: Currently we hotplug all memory as ZONE_MOVABLE. This has
known issues because there are theoretical and practical limits on the
ratio between ZONE_MOVABLE and ZONE_NORMAL.
With 'memory_hotplug.online_policy=auto-movable', we can use the config
value 'memory_hotplug.auto_movable_ratio' to set a MOVABLE:NORMAL ratio
to uphold during hotplug, and the kernel will Do The Right Thing™.

This change adds a new flag to neonvm-controller and neonvm-runner to
set the value of this ratio: '-memhp-auto-movable-ratio'.

However, please note: the new behavior is only available for VMs using
virtio-mem, because memory_hotplug.auto_movable_ratio does not count
hotplugged ZONE_NORMAL DIMM slots towards the ratio, so IMO it's not
worth changing the existing behavior. [1] [2] [3]

This is the productionized version of the experimentation from #959.
For more on that, see [4].

[1]: https://docs.kernel.org/admin-guide/mm/memory-hotplug.html#module-parameters
[2]: https://lwn.net/Articles/865618/
[3]: https://neondb.slack.com/archives/C03TN5G758R/p1718225978299459?thread_ts=1718158200.324759
[4]: https://www.notion.so/neondatabase/59c9e9b2619e4ccd8fbb99e9b0cb1a89
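
For a concrete sketch of the effect (derived from the neonvm/runner changes
below; 801 is the value set in the controller deployment, and later cmdline
parts such as extra-network settings are omitted), the guest kernel command
line now starts out roughly as:

  DIMMSlots:  panic=-1 init=/neonvm/bin/init console=ttyS1 loglevel=7 root=/dev/vda rw memhp_default_state=online_movable
  VirtioMem:  panic=-1 init=/neonvm/bin/init console=ttyS1 loglevel=7 root=/dev/vda rw memhp_default_state=online memory_hotplug.online_policy=auto-movable memory_hotplug.auto_movable_ratio=801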
sharnoff committed Jun 22, 2024
1 parent da8abff commit 41169c1
Showing 7 changed files with 42 additions and 3 deletions.
1 change: 1 addition & 0 deletions neonvm/config/controller/deployment.yaml
@@ -60,6 +60,7 @@ spec:
# * cache.no-flush=on - ignores disk flush operations (not needed; our disks are ephemeral)
- "--qemu-disk-cache-settings=cache.writeback=on,cache.direct=on,cache.no-flush=on"
- "--default-memory-provider=DIMMSlots"
- "--memhp-auto-movable-ratio=801" # for virtio-mem, set memory_hotplug.auto_movable_ratio=801
- "--failure-pending-period=1m"
- "--failing-refresh-interval=15s"
env:
8 changes: 8 additions & 0 deletions neonvm/controllers/config.go
@@ -33,6 +33,14 @@ type ReconcilerConfig struct {
// new VMs (or, when old ones restart) if nothing is explicitly set.
DefaultMemoryProvider vmv1.MemoryProvider

// MemhpAutoMovableRatio specifies the value that new neonvm-runners will set as the
// kernel's 'memory_hotplug.auto_movable_ratio', iff the memory provider is virtio-mem.
//
// This value is passed directly to neonvm-runner as the '-memhp-auto-movable-ratio' flag.
// We've confirmed sensible values are from 301 to 801 (i.e. 3.01:1 through 8.01:1).
// The range of sensible values may extend further, but we have not tested that.
MemhpAutoMovableRatio string

// FailurePendingPeriod is the period for the propagation of
// reconciliation failures to the observability instruments
FailurePendingPeriod time.Duration
1 change: 1 addition & 0 deletions neonvm/controllers/functests/vm_controller_test.go
@@ -112,6 +112,7 @@ var _ = Describe("VirtualMachine controller", func() {
MaxConcurrentReconciles: 1,
QEMUDiskCacheSettings: "cache=none",
DefaultMemoryProvider: vmv1.MemoryProviderDIMMSlots,
MemhpAutoMovableRatio: "301",
FailurePendingPeriod: 1 * time.Minute,
FailingRefreshInterval: 1 * time.Minute,
},
10 changes: 8 additions & 2 deletions neonvm/controllers/vm_controller.go
@@ -1461,15 +1461,21 @@ func podSpec(
}},
Command: func() []string {
cmd := []string{"runner"}
// intentionally add this first, so it's easier to see among the very long
// args that follow.
if config.UseContainerMgr {
cmd = append(cmd, "-skip-cgroup-management")
}
cmd = append(
cmd,
"-qemu-disk-cache-settings", config.QEMUDiskCacheSettings,
"-memory-provider", string(memoryProvider),
)
if memoryProvider == vmv1.MemoryProviderVirtioMem {
cmd = append(cmd, "-memhp-auto-movable-ratio", config.MemhpAutoMovableRatio)
}
// put these last, so that the earlier args are easier to see (because these
// can get quite large)
cmd = append(
cmd,
"-vmspec", base64.StdEncoding.EncodeToString(vmSpecJson),
"-vmstatus", base64.StdEncoding.EncodeToString(vmStatusJson),
)
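
For illustration, the runner invocation that podSpec builds for a virtio-mem VM
with the deployment defaults above comes out roughly as follows (omitting the
optional '-skip-cgroup-management' flag; 'VirtioMem' stands in for the string
form of vmv1.MemoryProviderVirtioMem, and the base64 payloads are placeholders):

  runner \
      -qemu-disk-cache-settings cache.writeback=on,cache.direct=on,cache.no-flush=on \
      -memory-provider VirtioMem \
      -memhp-auto-movable-ratio 801 \
      -vmspec <base64-encoded VM spec> \
      -vmstatus <base64-encoded VM status>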
1 change: 1 addition & 0 deletions neonvm/controllers/vm_controller_unit_test.go
@@ -117,6 +117,7 @@ func newTestParams(t *testing.T) *testParams {
MaxConcurrentReconciles: 10,
QEMUDiskCacheSettings: "",
DefaultMemoryProvider: vmv1.MemoryProviderDIMMSlots,
MemhpAutoMovableRatio: "301",
FailurePendingPeriod: time.Minute,
FailingRefreshInterval: time.Minute,
},
3 changes: 3 additions & 0 deletions neonvm/main.go
@@ -99,6 +99,7 @@ func main() {
var enableContainerMgr bool
var qemuDiskCacheSettings string
var defaultMemoryProvider vmv1.MemoryProvider
var memhpAutoMovableRatio string
var failurePendingPeriod time.Duration
var failingRefreshInterval time.Duration
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
@@ -110,6 +111,7 @@ func main() {
flag.BoolVar(&enableContainerMgr, "enable-container-mgr", false, "Enable crictl-based container-mgr alongside each VM")
flag.StringVar(&qemuDiskCacheSettings, "qemu-disk-cache-settings", "cache=none", "Set neonvm-runner's QEMU disk cache settings")
flag.Func("default-memory-provider", "Set default memory provider to use for new VMs", defaultMemoryProvider.FlagFunc)
flag.StringVar(&memhpAutoMovableRatio, "memhp-auto-movable-ratio", "301", "For virtio-mem, set VM kernel's memory_hotplug.auto_movable_ratio")
flag.DurationVar(&failurePendingPeriod, "failure-pending-period", 1*time.Minute,
"the period for the propagation of reconciliation failures to the observability instruments")
flag.DurationVar(&failingRefreshInterval, "failing-refresh-interval", 1*time.Minute,
@@ -175,6 +177,7 @@ func main() {
MaxConcurrentReconciles: concurrencyLimit,
QEMUDiskCacheSettings: qemuDiskCacheSettings,
DefaultMemoryProvider: defaultMemoryProvider,
MemhpAutoMovableRatio: memhpAutoMovableRatio,
FailurePendingPeriod: failurePendingPeriod,
FailingRefreshInterval: failingRefreshInterval,
}
21 changes: 20 additions & 1 deletion neonvm/runner/main.go
@@ -596,6 +596,7 @@ type Config struct {
skipCgroupManagement bool
diskCacheSettings string
memoryProvider vmv1.MemoryProvider
autoMovableRatio string
}

func newConfig(logger *zap.Logger) *Config {
@@ -607,6 +608,7 @@ func newConfig(logger *zap.Logger) *Config {
skipCgroupManagement: false,
diskCacheSettings: "cache=none",
memoryProvider: "", // Require that this is explicitly set. We'll check later.
autoMovableRatio: "", // Require that this is explicitly set IFF memoryProvider is VirtioMem. We'll check later.
}
flag.StringVar(&cfg.vmSpecDump, "vmspec", cfg.vmSpecDump,
"Base64 encoded VirtualMachine json specification")
@@ -622,11 +624,17 @@ func newConfig(logger *zap.Logger) *Config {
flag.StringVar(&cfg.diskCacheSettings, "qemu-disk-cache-settings",
cfg.diskCacheSettings, "Cache settings to add to -drive args for VM disks")
flag.Func("memory-provider", "Set provider for memory hotplug", cfg.memoryProvider.FlagFunc)
flag.StringVar(&cfg.autoMovableRatio, "memhp-auto-movable-ratio",
cfg.autoMovableRatio, "Set value of kernel's memory_hotplug.auto_movable_ratio [virtio-mem only]")

flag.Parse()

if cfg.memoryProvider == "" {
logger.Fatal("missing required flag '-memory-provider'")
}
if cfg.memoryProvider == vmv1.MemoryProviderVirtioMem && cfg.autoMovableRatio == "" {
logger.Fatal("missing required flag '-memhp-auto-movable-ratio'")
}

return cfg
}
@@ -911,12 +919,23 @@ func buildQEMUCmd(
}

const (
baseKernelCmdline = "panic=-1 init=/neonvm/bin/init memhp_default_state=online_movable console=ttyS1 loglevel=7 root=/dev/vda rw"
baseKernelCmdline = "panic=-1 init=/neonvm/bin/init console=ttyS1 loglevel=7 root=/dev/vda rw"
kernelCmdlineDIMMSlots = "memhp_default_state=online_movable"
kernelCmdlineVirtioMemTmpl = "memhp_default_state=online memory_hotplug.online_policy=auto-movable memory_hotplug.auto_movable_ratio=%s"
)

func makeKernelCmdline(cfg *Config, vmSpec *vmv1.VirtualMachineSpec, vmStatus *vmv1.VirtualMachineStatus) string {
cmdlineParts := []string{baseKernelCmdline}

switch cfg.memoryProvider {
case vmv1.MemoryProviderDIMMSlots:
cmdlineParts = append(cmdlineParts, kernelCmdlineDIMMSlots)
case vmv1.MemoryProviderVirtioMem:
cmdlineParts = append(cmdlineParts, fmt.Sprintf(kernelCmdlineVirtioMemTmpl, cfg.autoMovableRatio))
default:
panic(fmt.Errorf("unknown memory provider %s", cfg.memoryProvider))
}

if vmSpec.ExtraNetwork != nil && vmSpec.ExtraNetwork.Enable {
netDetails := fmt.Sprintf("ip=%s:::%s:%s:eth1:off", vmStatus.ExtraNetIP, vmStatus.ExtraNetMask, vmStatus.PodName)
cmdlineParts = append(cmdlineParts, netDetails)
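
A note on how the ratio value is interpreted (per the kernel docs [1], so treat
the numbers as an approximation): memory_hotplug.auto_movable_ratio is a maximum
MOVABLE:KERNEL memory ratio expressed in percent, so 301 allows roughly 3.01 MiB
of ZONE_MOVABLE per MiB of kernel-zone memory, and 801 roughly 8.01 MiB per MiB.
For example, with auto_movable_ratio=801 and 1 GiB of non-hotpluggable boot
memory, roughly up to 8.01 GiB of hotplugged memory can be onlined as
ZONE_MOVABLE before further blocks fall back to the kernel zones.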
