Skip to content

Commit

Permalink
handle possible errors in sync_container_lifecycle()
Browse files Browse the repository at this point in the history
  • Loading branch information
fregataa committed Jun 17, 2024
1 parent fd42bfb commit 331b744
Showing 1 changed file with 16 additions and 2 deletions.
18 changes: 16 additions & 2 deletions src/ai/backend/agent/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -1274,6 +1274,16 @@ async def sync_container_lifecycles(self, interval: float) -> None:
own_kernels: dict[KernelId, ContainerId] = {}
terminated_kernels: dict[KernelId, ContainerLifecycleEvent] = {}

def _get_session_id(container: Container) -> SessionId | None:
    """Return the session ID stored in the container's labels, if valid.

    Reads the ``ai.backend.session-id`` label and parses it as a UUID.

    Returns:
        The parsed ``SessionId``, or ``None`` (after logging a warning) when
        the label is missing or is not a well-formed UUID, so that callers
        can skip the container instead of aborting the whole sync pass.
    """
    _session_id = container.labels.get("ai.backend.session-id")
    try:
        return SessionId(UUID(_session_id))
    except (ValueError, TypeError):
        # A malformed string raises ValueError; a missing label makes
        # ``.get()`` return None, and UUID(None) raises TypeError.
        log.warning(
            f"sync_container_lifecycles() invalid session-id (cid: {container.id}, sid:{_session_id})"
        )
        return None

try:
_containers = await self.enumerate_containers(ACTIVE_STATUS_SET | DEAD_STATUS_SET)
async with self.registry_lock:
Expand All @@ -1292,7 +1302,9 @@ async def sync_container_lifecycles(self, interval: float) -> None:
kernel_id,
container.id,
)
session_id = SessionId(UUID(container.labels["ai.backend.session-id"]))
session_id = _get_session_id(container)
if session_id is None:
continue
terminated_kernels[kernel_id] = ContainerLifecycleEvent(
kernel_id,
session_id,
Expand All @@ -1307,7 +1319,9 @@ async def sync_container_lifecycles(self, interval: float) -> None:
]
for kernel_id, container in active_containers:
alive_kernels[kernel_id] = container.id
session_id = SessionId(UUID(container.labels["ai.backend.session-id"]))
session_id = _get_session_id(container)
if session_id is None:
continue
kernel_session_map[kernel_id] = session_id
own_kernels[kernel_id] = container.id
for kernel_id, kernel_obj in self.kernel_registry.items():
Expand Down

0 comments on commit 331b744

Please sign in to comment.