Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(ws): implement culling controller #63

Open
wants to merge 12 commits into
base: notebooks-v2
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion workspaces/controller/Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Image URL to use all building/pushing image targets
IMG ?= controller:latest
IMG ?= ghcr.io/kubeflow/notebooks/workspace-controller
# ENVTEST_K8S_VERSION refers to the version of kubebuilder assets to be downloaded by envtest binary.
ENVTEST_K8S_VERSION = 1.29.0

Expand Down Expand Up @@ -86,6 +86,9 @@ build: manifests generate fmt vet ## Build manager binary.
run: manifests generate fmt vet ## Run a controller from your host.
go run ./cmd/main.go

.PHONY: kind-load
kind-load: ## Load the ${IMG} image into the kind cluster named "kind".
	kind load docker-image ${IMG} -n kind

# If you wish to build the manager image targeting other platforms you can use the --platform flag.
# (i.e. docker build --platform linux/arm64). However, you must enable docker buildKit for it.
# More info: https://docs.docker.com/develop/develop-images/build_enhancements/
Expand Down
43 changes: 43 additions & 0 deletions workspaces/controller/api/v1beta1/workspace_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,12 @@ type WorkspaceSpec struct {
//+kubebuilder:default=false
Paused *bool `json:"paused,omitempty"`

// DisableCulling controls whether automatic culling is disabled for the workspace.
// If true, the workspace will not be culled
//+kubebuilder:validation:Optional
//+kubebuilder:default=false
DisableCulling *bool `json:"disableCulling,omitempty"`

// if true, pending updates are NOT applied when the Workspace is paused
// if false, pending updates are applied when the Workspace is paused
//+kubebuilder:validation:Optional
Expand Down Expand Up @@ -187,6 +193,9 @@ type WorkspaceActivity struct {
//+kubebuilder:default=0
//+kubebuilder:example=1704067200
LastUpdate int64 `json:"lastUpdate"`

// Information about the last activity probe
LastProbe ProbeStatus `json:"lastProbe"`
}

type WorkspacePodOptionsStatus struct {
Expand Down Expand Up @@ -221,6 +230,30 @@ type WorkspacePodOptionRedirectStep struct {
Target string `json:"target"`
}

// ProbeStatus records the start time, end time, result, and message of an activity probe.
type ProbeStatus struct {

// the time the probe was started (UNIX epoch in milliseconds)
//+kubebuilder:validation:Minimum=0
//+kubebuilder:example=1710435303000
StartTimeMs int64 `json:"startTimeMs"`

// the time the probe was completed (UNIX epoch in milliseconds)
//+kubebuilder:validation:Minimum=0
//+kubebuilder:example=1710435305000
EndTimeMs int64 `json:"endTimeMs"`

// the result of the probe
// ENUM: "Success" | "Failure" | "Timeout" | ""
//+kubebuilder:default=""
Result ProbeResult `json:"result"`

// a human-readable message about the probe result
// WARNING: this field is NOT FOR MACHINE USE, subject to change without notice
//+kubebuilder:default=""
//+kubebuilder:example="Jupyter probe succeeded"
Message string `json:"message"`
}

// +kubebuilder:validation:Enum:={"Running","Terminating","Paused","Pending","Error","Unknown"}
type WorkspaceState string

Expand All @@ -233,6 +266,15 @@ const (
WorkspaceStateUnknown WorkspaceState = "Unknown"
)

// ProbeResult is the result of an activity probe. The enum validation below
// permits "Success", "Failure", "Timeout", or the empty string.
// +kubebuilder:validation:Enum={"Success","Failure","Timeout",""}
type ProbeResult string

// Named ProbeResult values. No constant is declared in this block for the
// empty string, which the ProbeResult enum validation also accepts.
const (
ProbeResultSuccess ProbeResult = "Success"
ProbeResultFailure ProbeResult = "Failure"
ProbeResultTimeout ProbeResult = "Timeout"
)

/*
===============================================================================
Workspace
Expand All @@ -242,6 +284,7 @@ const (
//+kubebuilder:object:root=true
//+kubebuilder:printcolumn:name="State",type="string",JSONPath=".status.state",description="The current state of the Workspace"
//+kubebuilder:subresource:status
//+kubebuilder:resource:shortName=ws

// Workspace is the Schema for the Workspaces API
type Workspace struct {
Expand Down
40 changes: 35 additions & 5 deletions workspaces/controller/api/v1beta1/workspacekind_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,18 @@ type WorkspaceKindCullingConfig struct {
//+kubebuilder:default=86400
MaxInactiveSeconds *int32 `json:"maxInactiveSeconds,omitempty"`

// the maximum number of seconds between probes
//+kubebuilder:validation:Optional
//+kubebuilder:validation:Minimum:=60
//+kubebuilder:default=300
MaxProbeIntervalSeconds *int32 `json:"maxProbeIntervalSeconds,omitempty"`

// the minimum number of seconds between probes to avoid spamming in case of failure
//+kubebuilder:validation:Optional
//+kubebuilder:validation:Minimum:=10
//+kubebuilder:default=20
MinProbeIntervalSeconds *int32 `json:"minProbeIntervalSeconds,omitempty"`

// the probe used to determine if the Workspace is active
ActivityProbe ActivityProbe `json:"activityProbe"`
}
Expand All @@ -205,17 +217,35 @@ type ActivityProbe struct {
}

// ActivityProbeExec holds the settings for an exec-style activity probe.
type ActivityProbeExec struct {
// the command to run
//+kubebuilder:validation:MinItems:=1
//+kubebuilder:example={"bash", "-c", "exit 0"}
Command []string `json:"command"`
// the script should write a JSON file at this path.
// any existing file in this path will be REMOVED before the script is run
//+kubebuilder:example="/tmp/activity_probe.json"
OutputPath string `json:"outputPath"`

// the number of seconds to wait for the script to complete
//+kubebuilder:validation:Minimum:=1
//+kubebuilder:validation:Maximum:=300
//+kubebuilder:default=10
TimeoutSeconds int32 `json:"timeoutSeconds"`

// the script to run to determine if the Workspace is active
// - the script must exit with a 0 status code unless there is an error
// - workspaces with failing activity probes will NOT be culled
// - the script must have a shebang (e.g. `#!/usr/bin/env bash` or `#!/usr/bin/env python`)
// - the script should be idempotent and without side effects, it may be run multiple times
// - typically, it will be more efficient to write a probe which checks for a specific
// activity indicator agreed with your users, rather than checking the entire filesystem
Script string `json:"script"`
}

// ActivityProbeJupyter holds the settings for the Jupyter-specific activity probe.
// The validation rule below rejects objects where 'lastActivity' is unset or false.
// +kubebuilder:validation:XValidation:message="'lastActivity' must be true",rule="has(self.lastActivity) && self.lastActivity"
type ActivityProbeJupyter struct {
// if the Jupyter-specific probe is enabled
//+kubebuilder:example=true
LastActivity bool `json:"lastActivity"`

// The ID of the port used for probing Jupyter via HTTP requests.
PortId string `json:"portId"`
}

type WorkspaceKindProbes struct {
Expand Down Expand Up @@ -547,7 +577,7 @@ type OptionMetric struct {
//+kubebuilder:printcolumn:name="Deprecated",type="boolean",JSONPath=".spec.spawner.deprecated",description="If this WorkspaceKind is deprecated"
//+kubebuilder:printcolumn:name="Hidden",type="boolean",JSONPath=".spec.spawner.hidden",description="If this WorkspaceKind is hidden from the spawner UI"
//+kubebuilder:subresource:status
//+kubebuilder:resource:scope=Cluster
//+kubebuilder:resource:scope=Cluster,shortName=wsk

// WorkspaceKind is the Schema for the WorkspaceKinds API
type WorkspaceKind struct {
Expand Down
38 changes: 32 additions & 6 deletions workspaces/controller/api/v1beta1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 15 additions & 1 deletion workspaces/controller/cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ package main
import (
"crypto/tls"
"flag"
"k8s.io/client-go/kubernetes"
"os"

// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
// to ensure that exec-entrypoint and run can make use of them.
_ "k8s.io/client-go/plugin/pkg/client/auth"
Expand Down Expand Up @@ -147,6 +147,20 @@ func main() {
setupLog.Error(err, "unable to create controller", "controller", "WorkspaceKind")
os.Exit(1)
}
clientset, err := kubernetes.NewForConfig(mgr.GetConfig())
if err != nil {
setupLog.Error(err, "unable to create clientset")
os.Exit(1)
}
if err = (&controllerInternal.CullingReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Config: mgr.GetConfig(),
ClientSet: clientset,
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "Culler")
os.Exit(1)
}
//+kubebuilder:scaffold:builder

if os.Getenv("ENABLE_WEBHOOKS") != "false" {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ spec:
kind: WorkspaceKind
listKind: WorkspaceKindList
plural: workspacekinds
shortNames:
- wsk
singular: workspacekind
scope: Cluster
versions:
Expand Down Expand Up @@ -236,18 +238,34 @@ spec:
- if the Workspace had activity in the last 60 seconds this command
should return status 0, otherwise it should return status 1
properties:
command:
description: the command to run
example:
- bash
- -c
- exit 0
items:
type: string
minItems: 1
type: array
outputPath:
description: |-
the script should write a JSON file at this path.
any existing file in this path will be REMOVED before the script is run
example: /tmp/activity_probe.json
type: string
script:
description: |-
the script to run to determine if the Workspace is active
- the script must exit with a 0 status code unless there is an error
- workspaces with failing activity probes will NOT be culled
- the script must have a shebang (e.g. `#!/usr/bin/env bash` or `#!/usr/bin/env python`)
- the script should be idempotent and without side effects, it may be run multiple times
- typically, it will be more efficient to write a probe which checks for a specific
activity indicator agreed with your users, rather than checking the entire filesystem
type: string
timeoutSeconds:
default: 10
description: the number of seconds to wait for the
script to complete
format: int32
maximum: 300
minimum: 1
type: integer
required:
- command
- outputPath
- script
- timeoutSeconds
type: object
jupyter:
description: |-
Expand All @@ -260,8 +278,13 @@ spec:
description: if the Jupyter-specific probe is enabled
example: true
type: boolean
portId:
description: The ID of the port used for probing Jupyter
via HTTP requests.
type: string
required:
- lastActivity
- portId
type: object
x-kubernetes-validations:
- message: '''lastActivity'' must be true'
Expand All @@ -282,6 +305,19 @@ spec:
format: int32
minimum: 60
type: integer
maxProbeIntervalSeconds:
default: 300
description: the maximum number of seconds between probes
format: int32
minimum: 60
type: integer
minProbeIntervalSeconds:
default: 20
description: the minimum number of seconds between probes
to avoid spamming in case of failure
format: int32
minimum: 10
type: integer
required:
- activityProbe
type: object
Expand Down
Loading