neonvm: revert crictl patches #1054

Merged
merged 6 commits on Oct 7, 2024
1 change: 0 additions & 1 deletion neonvm/apis/neonvm/v1/virtualmachine_webhook.go
@@ -85,7 +85,6 @@ func (r *VirtualMachine) ValidateCreate() (admission.Warnings, error) {
"runtime",
"swapdisk",
"sysfscgroup",
"containerdsock",
"ssh-privatekey",
"ssh-publickey",
"ssh-authorized-keys",
2 changes: 1 addition & 1 deletion neonvm/config/controller/deployment.yaml
@@ -54,7 +54,7 @@ spec:
- "--leader-elect"
- "--concurrency-limit=128"
- "--skip-update-validation-for="
- "--enable-container-mgr"
- "--disable-runner-cgroup"
# See #775 and its links.
# * cache.writeback=on - don't set O_DSYNC (don't flush every write)
# * cache.direct=on - use O_DIRECT (don't abuse host's page cache!)
6 changes: 0 additions & 6 deletions neonvm/config/rbac/role.yaml
@@ -11,12 +11,6 @@ rules:
verbs:
- create
- patch
- apiGroups:
- ""
resources:
- nodes
verbs:
- list
- apiGroups:
- ""
resources:
20 changes: 0 additions & 20 deletions neonvm/controllers/config.go
@@ -11,18 +11,6 @@ import (
// ReconcilerConfig stores shared configuration for VirtualMachineReconciler and
// VirtualMachineMigrationReconciler.
type ReconcilerConfig struct {
// IsK3s is true iff the cluster is running k3s nodes.
//
// This is required because - unlike the other most common kubernetes distributions - k3s
// changes the location of the containerd socket.
// There unfortunately does not appear to be a way to disable this behavior.
IsK3s bool

// UseContainerMgr, if true, enables using container-mgr for new VM runner pods.
//
// This is defined as a config option so we can do a gradual rollout of this change.
UseContainerMgr bool

// DisableRunnerCgroup, if true, disables running QEMU in a cgroup in new VM runner pods.
// Fractional CPU scaling will continue to *pretend* to work, but it will not do anything in
// practice.
@@ -63,11 +51,3 @@ type ReconcilerConfig struct {
// updates of metrics and logs, related to failing reconciliations
FailingRefreshInterval time.Duration
}

func (c *ReconcilerConfig) criEndpointSocketPath() string {
if c.IsK3s {
return "/run/k3s/containerd/containerd.sock"
} else {
return "/run/containerd/containerd.sock"
}
}
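
For orientation, a minimal sketch of what ReconcilerConfig looks like once this revert lands — only the fields visible in the hunks above are shown, and everything else in the struct (unchanged by this PR) is elided:

type ReconcilerConfig struct {
	// DisableRunnerCgroup, if true, disables running QEMU in a cgroup in new VM runner pods.
	// Fractional CPU scaling will continue to *pretend* to work, but it will not do anything in
	// practice.
	DisableRunnerCgroup bool

	// ... other fields elided ...

	MaxConcurrentReconciles int

	SkipUpdateValidationFor map[types.NamespacedName]struct{}

	// FailingRefreshInterval is the interval between periodic updates of metrics and logs
	// related to failing reconciliations.
	FailingRefreshInterval time.Duration
}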
2 changes: 0 additions & 2 deletions neonvm/controllers/functests/vm_controller_test.go
@@ -107,8 +107,6 @@ var _ = Describe("VirtualMachine controller", func() {
Scheme: k8sClient.Scheme(),
Recorder: nil,
Config: &controllers.ReconcilerConfig{
IsK3s: false,
UseContainerMgr: true,
DisableRunnerCgroup: false,
MaxConcurrentReconciles: 1,
SkipUpdateValidationFor: nil,
134 changes: 7 additions & 127 deletions neonvm/controllers/vm_controller.go
@@ -48,7 +48,6 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/apiserver/pkg/storage/names"
"k8s.io/client-go/tools/record"

@@ -94,7 +93,6 @@ type VMReconciler struct {
//+kubebuilder:rbac:groups=vm.neon.tech,resources=virtualmachines/status,verbs=get;update;patch
//+kubebuilder:rbac:groups=vm.neon.tech,resources=virtualmachines/finalizers,verbs=update
//+kubebuilder:rbac:groups=core,resources=events,verbs=create;patch
//+kubebuilder:rbac:groups=core,resources=nodes,verbs=list
//+kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=core,resources=secrets,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=core,resources=pods/status,verbs=get;list;watch
@@ -934,20 +932,8 @@ const (

// runnerStatus returns a description of the status of the VM inside the runner pod.
//
// This is *similar* to the value of pod.Status.Phase, but takes into consideration the statuses of
// the individual containers within the pod. This is because Kubernetes sets the pod phase to Failed
// or Succeeded only if *all* pods have exited, whereas we'd like to consider the VM to be Failed or
// Succeeded if *any* pod has exited.
//
// The full set of outputs is:
//
// - runnerUnknown, if pod.Status.Phase is Unknown
// - runnerPending, if pod.Status.Phase is "" or Pending
// - runnerRunning, if pod.Status.Phase is Running, and no containers have exited
// - runnerFailed, if pod.Status.Phase is Failed, or if any container has failed, or if any
// container other than neonvm-runner has exited
// - runnerSucceeded, if pod.Status.Phase is Succeeded, or if neonvm-runner has exited
// successfully
// This is *similar* to the value of pod.Status.Phase, but we'd like to retain our own abstraction
// to have more control over the semantics.
func runnerStatus(pod *corev1.Pod) runnerStatusKind {
switch pod.Status.Phase {
case "", corev1.PodPending:
@@ -958,43 +944,8 @@ func runnerStatus(pod *corev1.Pod) runnerStatusKind {
return runnerFailed
case corev1.PodUnknown:
return runnerUnknown

// See comment above for context on this logic
case corev1.PodRunning:
nonRunnerContainerSucceeded := false
runnerContainerSucceeded := false

for _, stat := range pod.Status.ContainerStatuses {
if stat.State.Terminated != nil {
failed := stat.State.Terminated.ExitCode != 0
isRunner := stat.Name == "neonvm-runner"

if failed {
// return that the "runner" has failed if any container has.
return runnerFailed
} else /* succeeded */ {
if isRunner {
// neonvm-runner succeeded. We'll return runnerSucceeded if no other
// container has failed.
runnerContainerSucceeded = true
} else {
// Other container has succeeded. We'll return runnerSucceeded if
// neonvm-runner has succeeded, but runnerFailed if this exited while
// neonvm-runner is still going.
nonRunnerContainerSucceeded = true
}
}
}
}

if runnerContainerSucceeded {
return runnerSucceeded
} else if nonRunnerContainerSucceeded {
return runnerFailed
} else {
return runnerRunning
}

return runnerRunning
default:
panic(fmt.Errorf("unknown pod phase: %q", pod.Status.Phase))
}
@@ -1486,10 +1437,8 @@ func podSpec(
}},
Command: func() []string {
cmd := []string{"runner"}
if config.UseContainerMgr || config.DisableRunnerCgroup {
cmd = append(cmd, "-skip-cgroup-management")
}
if config.DisableRunnerCgroup {
cmd = append(cmd, "-skip-cgroup-management")
// cgroup management disabled, but we still need something to provide
// the server, so the runner will just provide a dummy implementation.
cmd = append(cmd, "-enable-dummy-cpu-server")
@@ -1535,7 +1484,7 @@ func podSpec(
MountPropagation: lo.ToPtr(corev1.MountPropagationNone),
}

if config.UseContainerMgr || config.DisableRunnerCgroup {
if config.DisableRunnerCgroup {
return []corev1.VolumeMount{images}
} else {
// the /sys/fs/cgroup mount is only necessary if neonvm-runner has to
@@ -1545,62 +1494,8 @@
}(),
Resources: vm.Spec.PodResources,
}
containerMgr := corev1.Container{
Image: image,
Name: "neonvm-container-mgr",
Command: []string{
"container-mgr",
"-port", strconv.Itoa(int(vm.Spec.RunnerPort)),
"-init-milli-cpu", strconv.Itoa(int(vm.Spec.Guest.CPUs.Use)),
},
Env: []corev1.EnvVar{
{
Name: "K8S_POD_UID",
ValueFrom: &corev1.EnvVarSource{
FieldRef: &corev1.ObjectFieldSelector{
FieldPath: "metadata.uid",
},
},
},
{
Name: "CRI_ENDPOINT",
Value: fmt.Sprintf("unix://%s", config.criEndpointSocketPath()),
},
},
LivenessProbe: &corev1.Probe{
InitialDelaySeconds: 10,
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
Path: "/healthz",
Port: intstr.FromInt(int(vm.Spec.RunnerPort)),
},
},
},
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("50m"),
corev1.ResourceMemory: resource.MustParse("50Mi"),
},
Limits: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("1"), // cpu limit > request, because usage is spiky
corev1.ResourceMemory: resource.MustParse("50Mi"),
},
},
// socket for crictl to connect to
VolumeMounts: []corev1.VolumeMount{
{
Name: "containerdsock",
MountPath: config.criEndpointSocketPath(),
},
},
}

if config.UseContainerMgr {
return []corev1.Container{runner, containerMgr}
} else {
// Return only the runner if we aren't supposed to use container-mgr
return []corev1.Container{runner}
}
return []corev1.Container{runner}
}(),
Volumes: func() []corev1.Volume {
images := corev1.Volume{
@@ -1618,19 +1513,7 @@
},
},
}
containerdSock := corev1.Volume{
Name: "containerdsock",
VolumeSource: corev1.VolumeSource{
HostPath: &corev1.HostPathVolumeSource{
Path: config.criEndpointSocketPath(),
Type: lo.ToPtr(corev1.HostPathSocket),
},
},
}

if config.UseContainerMgr {
return []corev1.Volume{images, containerdSock}
} else if config.DisableRunnerCgroup {
if config.DisableRunnerCgroup {
return []corev1.Volume{images}
} else {
return []corev1.Volume{images, cgroup}
@@ -1687,9 +1570,6 @@ func podSpec(
// If a custom neonvm-runner image is requested, use that instead:
if vm.Spec.RunnerImage != nil {
pod.Spec.Containers[0].Image = *vm.Spec.RunnerImage
if config.UseContainerMgr {
pod.Spec.Containers[1].Image = *vm.Spec.RunnerImage
}
}

// If a custom kernel is used, add that image:
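
Putting the runnerStatus hunks above together: after the revert, the function is a plain mapping from pod.Status.Phase, with no per-container inspection. A minimal sketch of the resulting function — the PodSucceeded and PodFailed arms sit in a collapsed region of this diff and are reconstructed here from the surrounding return values, so treat them as an assumption:

func runnerStatus(pod *corev1.Pod) runnerStatusKind {
	switch pod.Status.Phase {
	case "", corev1.PodPending:
		return runnerPending
	case corev1.PodSucceeded: // collapsed in the diff above; reconstructed
		return runnerSucceeded
	case corev1.PodFailed: // collapsed in the diff above; reconstructed
		return runnerFailed
	case corev1.PodUnknown:
		return runnerUnknown
	case corev1.PodRunning:
		return runnerRunning
	default:
		panic(fmt.Errorf("unknown pod phase: %q", pod.Status.Phase))
	}
}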
2 changes: 0 additions & 2 deletions neonvm/controllers/vm_controller_unit_test.go
@@ -115,8 +115,6 @@ func newTestParams(t *testing.T) *testParams {
Recorder: params.mockRecorder,
Scheme: scheme,
Config: &ReconcilerConfig{
IsK3s: false,
UseContainerMgr: false,
DisableRunnerCgroup: false,
MaxConcurrentReconciles: 10,
SkipUpdateValidationFor: nil,
42 changes: 0 additions & 42 deletions neonvm/main.go
@@ -38,16 +38,13 @@ import (
"sigs.k8s.io/controller-runtime/pkg/manager"
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/client-go/kubernetes"
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
// to ensure that exec-entrypoint and run can make use of them.
_ "k8s.io/client-go/plugin/pkg/client/auth"
"k8s.io/client-go/rest"
"k8s.io/klog/v2"

vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
@@ -97,7 +94,6 @@ func main() {
var probeAddr string
var concurrencyLimit int
var skipUpdateValidationFor map[types.NamespacedName]struct{}
var enableContainerMgr bool
var disableRunnerCgroup bool
var qemuDiskCacheSettings string
var defaultMemoryProvider vmv1.MemoryProvider
@@ -136,8 +132,6 @@
return nil
},
)
// note: cannot have both -enable-container-mgr and -disable-runner-cgroup.
flag.BoolVar(&enableContainerMgr, "enable-container-mgr", false, "Enable crictl-based container-mgr alongside each VM")
flag.BoolVar(&disableRunnerCgroup, "disable-runner-cgroup", false, "Disable creation of a cgroup in neonvm-runner for fractional CPU limiting")
flag.StringVar(&qemuDiskCacheSettings, "qemu-disk-cache-settings", "cache=none", "Set neonvm-runner's QEMU disk cache settings")
flag.Func("default-memory-provider", "Set default memory provider to use for new VMs", defaultMemoryProvider.FlagFunc)
@@ -152,10 +146,6 @@ func main() {
fmt.Fprintln(os.Stderr, "missing required flag '-default-memory-provider'")
os.Exit(1)
}
if disableRunnerCgroup && enableContainerMgr {
fmt.Fprintln(os.Stderr, "Cannot have both '-enable-container-mgr' and '-disable-runner-cgroup'")
os.Exit(1)
}

logConfig := zap.NewProductionConfig()
logConfig.Sampling = nil // Disabling sampling; it's enabled by default for zap's production configs.
@@ -171,14 +161,6 @@ func main() {
cfg := ctrl.GetConfigOrDie()
cfg.QPS = 1000
cfg.Burst = 2000

// fetch node info to determine if we're running in k3s
isK3s, err := checkIfRunningInK3sCluster(cfg)
if err != nil {
setupLog.Error(err, "unable to check if running in k3s")
os.Exit(1)
}

mgr, err := ctrl.NewManager(cfg, ctrl.Options{
Scheme: scheme,
Metrics: metricsserver.Options{
@@ -205,8 +187,6 @@ func main() {
reconcilerMetrics := controllers.MakeReconcilerMetrics()

rc := &controllers.ReconcilerConfig{
IsK3s: isK3s,
UseContainerMgr: enableContainerMgr,
DisableRunnerCgroup: disableRunnerCgroup,
MaxConcurrentReconciles: concurrencyLimit,
SkipUpdateValidationFor: skipUpdateValidationFor,
@@ -287,28 +267,6 @@ func main() {
}
}

func checkIfRunningInK3sCluster(cfg *rest.Config) (bool, error) {
client, err := kubernetes.NewForConfig(cfg)
if err != nil {
return false, fmt.Errorf("failed to create new k8s client: %w", err)
}

ctx, cancel := context.WithTimeout(context.TODO(), 10*time.Second)
defer cancel()

nodes, err := client.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
if err != nil {
return false, err
}

for _, node := range nodes.Items {
if strings.HasPrefix(node.Status.NodeInfo.OSImage, "K3s") {
return true, nil
}
}
return false, nil
}

func debugServerFunc(reconcilers ...controllers.ReconcilerWithMetrics) manager.RunnableFunc {
return manager.RunnableFunc(func(ctx context.Context) error {
mux := http.NewServeMux()