neonvm: Add controller flag to disable runner cgroup #1034

Merged · 3 commits · Aug 29, 2024
8 changes: 8 additions & 0 deletions neonvm/controllers/config.go
@@ -23,6 +23,14 @@ type ReconcilerConfig struct {
// This is defined as a config option so we can do a gradual rollout of this change.
UseContainerMgr bool

// DisableRunnerCgroup, if true, disables running QEMU in a cgroup in new VM runner pods.
// Fractional CPU scaling will continue to *pretend* to work, but it will not do anything in
// practice.
//
// Under the hood, this results in passing -skip-cgroup-management and -enable-dummy-cpu-server
// to neonvm-runner.
DisableRunnerCgroup bool

MaxConcurrentReconciles int

// SkipUpdateValidationFor is the set of object names that we should ignore when doing webhook
1 change: 1 addition & 0 deletions neonvm/controllers/functests/vm_controller_test.go
@@ -109,6 +109,7 @@ var _ = Describe("VirtualMachine controller", func() {
Config: &controllers.ReconcilerConfig{
IsK3s: false,
UseContainerMgr: true,
DisableRunnerCgroup: false,
MaxConcurrentReconciles: 1,
SkipUpdateValidationFor: nil,
QEMUDiskCacheSettings: "cache=none",
11 changes: 9 additions & 2 deletions neonvm/controllers/vm_controller.go
@@ -1486,9 +1486,14 @@ func podSpec(
}},
Command: func() []string {
cmd := []string{"runner"}
if config.UseContainerMgr {
if config.UseContainerMgr || config.DisableRunnerCgroup {
cmd = append(cmd, "-skip-cgroup-management")
}
if config.DisableRunnerCgroup {
// cgroup management disabled, but we still need something to provide
// the server, so the runner will just provide a dummy implementation.
cmd = append(cmd, "-enable-dummy-cpu-server")
}
cmd = append(
cmd,
"-qemu-disk-cache-settings", config.QEMUDiskCacheSettings,
@@ -1530,7 +1535,7 @@ func podSpec(
MountPropagation: lo.ToPtr(corev1.MountPropagationNone),
}

if config.UseContainerMgr {
if config.UseContainerMgr || config.DisableRunnerCgroup {
return []corev1.VolumeMount{images}
} else {
// the /sys/fs/cgroup mount is only necessary if neonvm-runner has to
@@ -1625,6 +1630,8 @@ func podSpec(

if config.UseContainerMgr {
return []corev1.Volume{images, containerdSock}
} else if config.DisableRunnerCgroup {
return []corev1.Volume{images}
} else {
return []corev1.Volume{images, cgroup}
}
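A minimal standalone sketch (not part of the PR) isolating the cgroup-related flag selection shown above; the real podSpec command also appends -qemu-disk-cache-settings and the other runner options:

```go
package main

import "fmt"

// runnerCgroupArgs mirrors the flag-selection logic in podSpec: container-mgr
// and the new disable-runner-cgroup option both stop neonvm-runner from
// managing the cgroup itself, and only the latter adds the dummy CPU server.
func runnerCgroupArgs(useContainerMgr, disableRunnerCgroup bool) []string {
	cmd := []string{"runner"}
	if useContainerMgr || disableRunnerCgroup {
		cmd = append(cmd, "-skip-cgroup-management")
	}
	if disableRunnerCgroup {
		cmd = append(cmd, "-enable-dummy-cpu-server")
	}
	return cmd
}

func main() {
	fmt.Println(runnerCgroupArgs(false, false)) // [runner]
	fmt.Println(runnerCgroupArgs(true, false))  // [runner -skip-cgroup-management]
	fmt.Println(runnerCgroupArgs(false, true))  // [runner -skip-cgroup-management -enable-dummy-cpu-server]
}
```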
1 change: 1 addition & 0 deletions neonvm/controllers/vm_controller_unit_test.go
@@ -117,6 +117,7 @@ func newTestParams(t *testing.T) *testParams {
Config: &ReconcilerConfig{
IsK3s: false,
UseContainerMgr: false,
DisableRunnerCgroup: false,
MaxConcurrentReconciles: 10,
SkipUpdateValidationFor: nil,
QEMUDiskCacheSettings: "",
8 changes: 8 additions & 0 deletions neonvm/main.go
@@ -101,6 +101,7 @@ func main() {
var concurrencyLimit int
var skipUpdateValidationFor map[types.NamespacedName]struct{}
var enableContainerMgr bool
var disableRunnerCgroup bool
var qemuDiskCacheSettings string
var defaultMemoryProvider vmv1.MemoryProvider
var memhpAutoMovableRatio string
@@ -138,7 +139,9 @@ func main() {
return nil
},
)
// note: cannot have both -enable-container-mgr and -disable-runner-cgroup.
flag.BoolVar(&enableContainerMgr, "enable-container-mgr", false, "Enable crictl-based container-mgr alongside each VM")
flag.BoolVar(&disableRunnerCgroup, "disable-runner-cgroup", false, "Disable creation of a cgroup in neonvm-runner for fractional CPU limiting")
flag.StringVar(&qemuDiskCacheSettings, "qemu-disk-cache-settings", "cache=none", "Set neonvm-runner's QEMU disk cache settings")
flag.Func("default-memory-provider", "Set default memory provider to use for new VMs", defaultMemoryProvider.FlagFunc)
flag.StringVar(&memhpAutoMovableRatio, "memhp-auto-movable-ratio", "301", "For virtio-mem, set VM kernel's memory_hotplug.auto_movable_ratio")
@@ -152,6 +155,10 @@ func main() {
fmt.Fprintln(os.Stderr, "missing required flag '-default-memory-provider'")
os.Exit(1)
}
if disableRunnerCgroup && enableContainerMgr {
fmt.Fprintln(os.Stderr, "Cannot have both '-enable-container-mgr' and '-disable-runner-cgroup'")
os.Exit(1)
}

logConfig := zap.NewProductionConfig()
logConfig.Sampling = nil // Disabling sampling; it's enabled by default for zap's production configs.
@@ -205,6 +212,7 @@ func main() {
rc := &controllers.ReconcilerConfig{
IsK3s: isK3s,
UseContainerMgr: enableContainerMgr,
DisableRunnerCgroup: disableRunnerCgroup,
MaxConcurrentReconciles: concurrencyLimit,
SkipUpdateValidationFor: skipUpdateValidationFor,
QEMUDiskCacheSettings: qemuDiskCacheSettings,
78 changes: 68 additions & 10 deletions neonvm/runner/main.go
@@ -23,6 +23,7 @@ import (
"strconv"
"strings"
"sync"
"sync/atomic"
"syscall"
"time"

@@ -36,6 +37,7 @@ import (
"github.com/jpillora/backoff"
"github.com/kdomanski/iso9660"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/samber/lo"
"github.com/vishvananda/netlink"
"go.uber.org/zap"

@@ -594,6 +596,7 @@ type Config struct {
kernelPath string
appendKernelCmdline string
skipCgroupManagement bool
enableDummyCPUServer bool
diskCacheSettings string
memoryProvider vmv1.MemoryProvider
autoMovableRatio string
@@ -606,6 +609,7 @@ func newConfig(logger *zap.Logger) *Config {
kernelPath: defaultKernelPath,
appendKernelCmdline: "",
skipCgroupManagement: false,
enableDummyCPUServer: false,
diskCacheSettings: "cache=none",
memoryProvider: "", // Require that this is explicitly set. We'll check later.
autoMovableRatio: "", // Require that this is explicitly set IFF memoryProvider is VirtioMem. We'll check later.
@@ -620,7 +624,10 @@ func newConfig(logger *zap.Logger) *Config {
cfg.appendKernelCmdline, "Additional kernel command line arguments")
flag.BoolVar(&cfg.skipCgroupManagement, "skip-cgroup-management",
cfg.skipCgroupManagement,
"Don't try to manage CPU (use if running alongside container-mgr)")
"Don't try to manage CPU (use if running alongside container-mgr, or if dummy CPU server is enabled)")
flag.BoolVar(&cfg.enableDummyCPUServer, "enable-dummy-cpu-server",
Review comment (Contributor):

Do we actually plan to use runner without the enable-dummy-cpu-server?

I don't mind that it is there, but I think I'd run the server unconditionally, until we (likely) decide to abandon it overall.

Reply (Member, Author):

> Do we actually plan to use runner without the enable-dummy-cpu-server?

For now, yes. We haven't come to a decision beyond that.

cfg.skipCgroupManagement,
"Provide a CPU server (unlike -skip-cgroup-management) but don't actually do anything with it")
flag.StringVar(&cfg.diskCacheSettings, "qemu-disk-cache-settings",
cfg.diskCacheSettings, "Cache settings to add to -drive args for VM disks")
flag.Func("memory-provider", "Set provider for memory hotplug", cfg.memoryProvider.FlagFunc)
Expand All @@ -635,6 +642,9 @@ func newConfig(logger *zap.Logger) *Config {
if cfg.memoryProvider == vmv1.MemoryProviderVirtioMem && cfg.autoMovableRatio == "" {
logger.Fatal("missing required flag '-memhp-auto-movable-ratio'")
}
if cfg.enableDummyCPUServer && !cfg.skipCgroupManagement {
logger.Fatal("flag -enable-dummy-cpu-server requires -skip-cgroup-management")
}

return cfg
}
@@ -1007,9 +1017,36 @@ func runQEMU(

wg.Add(1)
go terminateQemuOnSigterm(ctx, logger, &wg)
if !cfg.skipCgroupManagement {
if !cfg.skipCgroupManagement || cfg.enableDummyCPUServer {
var callbacks cpuServerCallbacks

if cfg.enableDummyCPUServer {
lastValue := &atomic.Uint32{}
lastValue.Store(uint32(vmSpec.Guest.CPUs.Min))

callbacks = cpuServerCallbacks{
get: func(logger *zap.Logger) (*vmv1.MilliCPU, error) {
return lo.ToPtr(vmv1.MilliCPU(lastValue.Load())), nil
},
set: func(logger *zap.Logger, cpu vmv1.MilliCPU) error {
lastValue.Store(uint32(cpu))
return nil
},
}
} else {
// Standard implementation -- actually set the cgroup
callbacks = cpuServerCallbacks{
get: func(logger *zap.Logger) (*vmv1.MilliCPU, error) {
return getCgroupQuota(cgroupPath)
},
set: func(logger *zap.Logger, cpu vmv1.MilliCPU) error {
return setCgroupLimit(logger, cpu, cgroupPath)
},
}
}

wg.Add(1)
go listenForCPUChanges(ctx, logger, vmSpec.RunnerPort, cgroupPath, &wg)
go listenForCPUChanges(ctx, logger, vmSpec.RunnerPort, callbacks, &wg)
}
wg.Add(1)
go forwardLogs(ctx, logger, &wg)
@@ -1040,7 +1077,12 @@ func runQEMU(
return err
}

func handleCPUChange(logger *zap.Logger, w http.ResponseWriter, r *http.Request, cgroupPath string) {
func handleCPUChange(
logger *zap.Logger,
w http.ResponseWriter,
r *http.Request,
set func(*zap.Logger, vmv1.MilliCPU) error,
) {
if r.Method != "POST" {
logger.Error("unexpected method", zap.String("method", r.Method))
w.WriteHeader(400)
@@ -1062,7 +1104,7 @@ func handleCPUChange(logger *zap.Logger, w http.ResponseWriter, r *http.Request,

// update cgroup
logger.Info("got CPU update", zap.Float64("CPU", parsed.VCPUs.AsFloat64()))
err = setCgroupLimit(logger, parsed.VCPUs, cgroupPath)
err = set(logger, parsed.VCPUs)
if err != nil {
logger.Error("could not set cgroup limit", zap.Error(err))
w.WriteHeader(500)
@@ -1072,14 +1114,19 @@ func handleCPUChange(logger *zap.Logger, w http.ResponseWriter, r *http.Request,
w.WriteHeader(200)
}

func handleCPUCurrent(logger *zap.Logger, w http.ResponseWriter, r *http.Request, cgroupPath string) {
func handleCPUCurrent(
logger *zap.Logger,
w http.ResponseWriter,
r *http.Request,
get func(*zap.Logger) (*vmv1.MilliCPU, error),
) {
if r.Method != "GET" {
logger.Error("unexpected method", zap.String("method", r.Method))
w.WriteHeader(400)
return
}

cpus, err := getCgroupQuota(cgroupPath)
cpus, err := get(logger)
if err != nil {
logger.Error("could not get cgroup quota", zap.Error(err))
w.WriteHeader(500)
@@ -1097,17 +1144,28 @@ func handleCPUCurrent(logger *zap.Logger, w http.ResponseWriter, r *http.Request
w.Write(body) //nolint:errcheck // Not much to do with the error here. TODO: log it?
}

func listenForCPUChanges(ctx context.Context, logger *zap.Logger, port int32, cgroupPath string, wg *sync.WaitGroup) {
type cpuServerCallbacks struct {
get func(*zap.Logger) (*vmv1.MilliCPU, error)
set func(*zap.Logger, vmv1.MilliCPU) error
}

func listenForCPUChanges(
ctx context.Context,
logger *zap.Logger,
port int32,
callbacks cpuServerCallbacks,
wg *sync.WaitGroup,
) {
defer wg.Done()
mux := http.NewServeMux()
loggerHandlers := logger.Named("http-handlers")
cpuChangeLogger := loggerHandlers.Named("cpu_change")
mux.HandleFunc("/cpu_change", func(w http.ResponseWriter, r *http.Request) {
handleCPUChange(cpuChangeLogger, w, r, cgroupPath)
handleCPUChange(cpuChangeLogger, w, r, callbacks.set)
})
cpuCurrentLogger := loggerHandlers.Named("cpu_current")
mux.HandleFunc("/cpu_current", func(w http.ResponseWriter, r *http.Request) {
handleCPUCurrent(cpuCurrentLogger, w, r, cgroupPath)
handleCPUCurrent(cpuCurrentLogger, w, r, callbacks.get)
})
server := http.Server{
Addr: fmt.Sprintf("0.0.0.0:%d", port),
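For reference, a self-contained sketch (not the PR's code) of what -enable-dummy-cpu-server amounts to: the server remembers the last CPU value it was told and echoes it back on request, never touching a cgroup, so fractional CPU scaling appears to work but does nothing in practice. The endpoint paths mirror the PR; the JSON field name and the port below are placeholders, not the real api types or vmSpec.RunnerPort.

```go
package main

import (
	"encoding/json"
	"log"
	"net/http"
	"sync/atomic"
)

func main() {
	var lastValue atomic.Uint32
	lastValue.Store(250) // starting point, e.g. vmSpec.Guest.CPUs.Min in milli-CPU

	mux := http.NewServeMux()
	mux.HandleFunc("/cpu_change", func(w http.ResponseWriter, r *http.Request) {
		if r.Method != http.MethodPost {
			w.WriteHeader(http.StatusBadRequest)
			return
		}
		var body struct {
			MilliCPU uint32 `json:"milliCPU"` // placeholder shape, not the real api type
		}
		if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
			w.WriteHeader(http.StatusBadRequest)
			return
		}
		// "set" only records the value; no cgroup limit is applied.
		lastValue.Store(body.MilliCPU)
		w.WriteHeader(http.StatusOK)
	})
	mux.HandleFunc("/cpu_current", func(w http.ResponseWriter, r *http.Request) {
		if r.Method != http.MethodGet {
			w.WriteHeader(http.StatusBadRequest)
			return
		}
		// "get" reports whatever was last set.
		_ = json.NewEncoder(w).Encode(map[string]uint32{"milliCPU": lastValue.Load()})
	})

	log.Fatal(http.ListenAndServe("0.0.0.0:25000", mux))
}
```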