Skip to content

Commit

Permalink
fix: gpu limits, priority class and https in redirect (#556)
Browse files Browse the repository at this point in the history
* fix: always use https in oauth redirect url

* fix: always set limits for gpus

Without a limit for gpus on AWS we cannot schedule sessions.

* fix: add priority class to sessions
  • Loading branch information
olevski committed Dec 2, 2024
1 parent 0c8096a commit 3ee24fe
Show file tree
Hide file tree
Showing 6 changed files with 59 additions and 9 deletions.
9 changes: 8 additions & 1 deletion components/renku_data_services/crc/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ class ResourceClass(ResourcesCompareMixin):
matching: Optional[bool] = None
node_affinities: list[NodeAffinity] = field(default_factory=list)
tolerations: list[str] = field(default_factory=list)
quota: str | None = None

def __post_init__(self) -> None:
if len(self.name) > 40:
Expand All @@ -109,14 +110,20 @@ def from_dict(cls, data: dict) -> "ResourceClass":
"""Create the model from a plain dictionary."""
node_affinities: list[NodeAffinity] = []
tolerations: list[str] = []
quota: str | None = None
if data.get("node_affinities"):
node_affinities = [
NodeAffinity.from_dict(affinity) if isinstance(affinity, dict) else affinity
for affinity in data.get("node_affinities", [])
]
if isinstance(data.get("tolerations"), list):
tolerations = [toleration for toleration in data["tolerations"]]
return cls(**{**data, "tolerations": tolerations, "node_affinities": node_affinities})
if data_quota := data.get("quota"):
if isinstance(data_quota, str):
quota = data_quota
elif isinstance(data_quota, Quota):
quota = data_quota.id
return cls(**{**data, "tolerations": tolerations, "node_affinities": node_affinities, "quota": quota})

def is_quota_valid(self, quota: "Quota") -> bool:
"""Determine if a quota is compatible with the resource class."""
Expand Down
5 changes: 4 additions & 1 deletion components/renku_data_services/crc/orm.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,9 @@ class ResourceClassORM(BaseORM):
resource_pool_id: Mapped[Optional[int]] = mapped_column(
ForeignKey("resource_pools.id", ondelete="CASCADE"), default=None, index=True
)
resource_pool: Mapped[Optional["ResourcePoolORM"]] = relationship(back_populates="classes", default=None)
resource_pool: Mapped[Optional["ResourcePoolORM"]] = relationship(
back_populates="classes", default=None, lazy="joined"
)
id: Mapped[int] = mapped_column(Integer, Identity(always=True), primary_key=True, default=None, init=False)
tolerations: Mapped[list["TolerationORM"]] = relationship(
back_populates="resource_class",
Expand Down Expand Up @@ -127,6 +129,7 @@ def dump(self, matching_criteria: models.ResourceClass | None = None) -> models.
node_affinities=[affinity.dump() for affinity in self.node_affinities],
tolerations=[toleration.key for toleration in self.tolerations],
matching=matching,
quota=self.resource_pool.quota if self.resource_pool else None,
)


Expand Down
22 changes: 18 additions & 4 deletions components/renku_data_services/notebooks/blueprints.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,9 +291,18 @@ async def _handler(
default_resource_class = await self.rp_repo.get_default_resource_class()
if default_resource_class.id is None:
raise errors.ProgrammingError(message="The default resource class has to have an ID", quiet=True)
resource_class_id = body.resource_class_id or default_resource_class.id
resource_class_id: int
quota: str | None = None
if body.resource_class_id is None:
resource_class = await self.rp_repo.get_default_resource_class()
# TODO: Add types for saved and unsaved resource class
resource_class_id = cast(int, resource_class.id)
else:
resource_class = await self.rp_repo.get_resource_class(user, resource_class_id)
# TODO: Add types for saved and unsaved resource class
resource_class_id = cast(int, resource_class.id)
quota = resource_class.quota
await self.nb_config.crc_validator.validate_class_storage(user, resource_class_id, body.disk_storage)
resource_class = await self.rp_repo.get_resource_class(user, resource_class_id)
work_dir_fallback = PurePosixPath("/home/jovyan")
work_dir = environment.working_directory or image_workdir or work_dir_fallback
storage_mount_fallback = work_dir / "work"
Expand Down Expand Up @@ -411,6 +420,7 @@ async def _handler(
)

base_server_url = self.nb_config.sessions.ingress.base_url(server_name)
base_server_https_url = self.nb_config.sessions.ingress.base_url(server_name, force_https=True)
base_server_path = self.nb_config.sessions.ingress.base_path(server_name)
ui_path: str = (
f"{base_server_path.rstrip("/")}/{environment.default_url.lstrip("/")}"
Expand All @@ -426,9 +436,11 @@ async def _handler(
"cpu": str(round(resource_class.cpu * 1000)) + "m",
"memory": f"{resource_class.memory}Gi",
}
limits: dict[str, str | int] = {}
if resource_class.gpu > 0:
gpu_name = GpuKind.NVIDIA.value + "/gpu"
requests[gpu_name] = resource_class.gpu
limits[gpu_name] = resource_class.gpu
tolerations = [
Toleration.model_validate(toleration) for toleration in self.nb_config.sessions.tolerations
] + tolerations_from_resource_class(resource_class)
Expand All @@ -444,6 +456,7 @@ async def _handler(
codeRepositories=[],
hibernated=False,
reconcileStrategy=ReconcileStrategy.whenFailedOrHibernated,
priorityClassName=quota,
session=Session(
image=image,
urlPath=ui_path,
Expand All @@ -458,7 +471,7 @@ async def _handler(
workingDir=work_dir.as_posix(),
runAsUser=environment.uid,
runAsGroup=environment.gid,
resources=Resources(requests=requests),
resources=Resources(requests=requests, limits=limits if len(limits) > 0 else None),
extraVolumeMounts=[],
command=environment.command,
args=environment.args,
Expand Down Expand Up @@ -521,7 +534,8 @@ async def _handler(
"oidc_issuer_url": self.nb_config.sessions.oidc.issuer_url,
"session_cookie_minimal": True,
"skip_provider_button": True,
"redirect_url": urljoin(base_server_url + "/", "oauth2/callback"),
# NOTE: If the redirect url is not HTTPS then some or identity providers will fail.
"redirect_url": urljoin(base_server_https_url + "/", "oauth2/callback"),
"cookie_path": base_server_path,
"proxy_prefix": parsed_proxy_url.path,
"authenticated_emails_file": "/authorized_emails",
Expand Down
4 changes: 3 additions & 1 deletion components/renku_data_services/notebooks/config/dynamic.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,8 +261,10 @@ def from_env(cls) -> Self:
def base_path(self, server_name: str) -> str:
return f"/sessions/{server_name}"

def base_url(self, server_name: str) -> str:
def base_url(self, server_name: str, force_https: bool = False) -> str:
scheme = "https" if self.tls_secret else "http"
if force_https:
scheme = "https"
return urlunparse((scheme, self.host, self.base_path(server_name), None, None, None))


Expand Down
22 changes: 21 additions & 1 deletion components/renku_data_services/notebooks/cr_amalthea_session.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# generated by datamodel-codegen:
# filename: <stdin>
# timestamp: 2024-11-13T08:52:55+00:00
# timestamp: 2024-11-29T10:46:16+00:00

from __future__ import annotations

Expand Down Expand Up @@ -3072,6 +3072,19 @@ class EnvItem2(BaseCRD):
)


class Type3(Enum):
none = "none"
tcp = "tcp"
http = "http"


class ReadinessProbe2(BaseCRD):
model_config = ConfigDict(
extra="allow",
)
type: Type3 = Field(default="tcp", description="The type of readiness probe")


class Resources3(BaseCRD):
model_config = ConfigDict(
extra="allow",
Expand Down Expand Up @@ -3125,6 +3138,9 @@ class Session(BaseCRD):
gt=0,
lt=65535,
)
readinessProbe: ReadinessProbe2 = Field(
default={}, description="The readiness probe to use on the session container"
)
resources: Optional[Resources3] = Field(
default=None,
description="Resource requirements and limits in the same format as a Pod in Kubernetes",
Expand Down Expand Up @@ -3221,6 +3237,10 @@ class Spec(BaseCRD):
default=None,
description="Selector which must match a node's labels for the pod to be scheduled on that node.\nPassed right through to the Statefulset used for the session.",
)
priorityClassName: Optional[str] = Field(
default=None,
description="The name of the priority class assigned to the session Pod.",
)
reconcileStrategy: ReconcileStrategy = Field(
default="always",
description="Indicates how Amalthea should reconcile the child resources for a session. This can be problematic because\nnewer versions of Amalthea may include new versions of the sidecars or other changes not reflected\nin the AmaltheaSession CRD, so simply updating Amalthea could cause existing sessions to restart\nbecause the sidecars will have a newer image or for other reasons because the code changed.\nHibernating the session and deleting it will always work as expected regardless of the strategy.\nThe status of the session and all hibernation or auto-cleanup functionality will always work as expected.\nA few values are possible:\n- never: Amalthea will never update any of the child resources and will ignore any changes to the CR\n- always: This is the expected method of operation for an operator, changes to the spec are always reconciled\n- whenHibernatedOrFailed: To avoid interrupting a running session, reconciliation of the child components\n are only done when the session has a Failed or Hibernated status",
Expand Down
6 changes: 5 additions & 1 deletion test/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,11 +327,15 @@ def remove_id_from_rc(rc: rp_models.ResourceClass) -> rp_models.ResourceClass:
return rp_models.ResourceClass.from_dict(kwargs)


def remove_quota_from_rc(rc: rp_models.ResourceClass) -> rp_models.ResourceClass:
return rc.update(quota=None)


def remove_id_from_rp(rp: rp_models.ResourcePool) -> rp_models.ResourcePool:
quota = rp.quota
if isinstance(quota, rp_models.Quota):
quota = remove_id_from_quota(quota)
classes = [remove_id_from_rc(rc) for rc in rp.classes]
classes = [remove_quota_from_rc(remove_id_from_rc(rc)) for rc in rp.classes]
return rp_models.ResourcePool(
name=rp.name,
id=None,
Expand Down

0 comments on commit 3ee24fe

Please sign in to comment.