From bbd10b28b11d0ad9305123e21572c2960f2ba197 Mon Sep 17 00:00:00 2001 From: Sanghun Lee Date: Fri, 7 Jun 2024 12:15:41 +0900 Subject: [PATCH] enhance: fetch containers eagerly when sync containers --- src/ai/backend/agent/agent.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/ai/backend/agent/agent.py b/src/ai/backend/agent/agent.py index fd7cea9635..ab9c156a15 100644 --- a/src/ai/backend/agent/agent.py +++ b/src/ai/backend/agent/agent.py @@ -1265,10 +1265,17 @@ async def sync_container_lifecycles(self, interval: float) -> None: own_kernels: dict[KernelId, ContainerId] = {} terminated_kernels: dict[KernelId, ContainerLifecycleEvent] = {} + _containers = await self.enumerate_containers(DEAD_STATUS_SET | ACTIVE_STATUS_SET) + async with self.registry_lock: try: # Check if: there are dead containers - for kernel_id, container in await self.enumerate_containers(DEAD_STATUS_SET): + dead_containers = [ + (kid, container) + for kid, container in _containers + if container.status in DEAD_STATUS_SET + ] + for kernel_id, container in dead_containers: if kernel_id in self.restarting_kernels: continue log.info( @@ -1284,7 +1291,12 @@ async def sync_container_lifecycles(self, interval: float) -> None: LifecycleEvent.CLEAN, KernelLifecycleEventReason.SELF_TERMINATED, ) - for kernel_id, container in await self.enumerate_containers(ACTIVE_STATUS_SET): + alive_containers = [ + (kid, container) + for kid, container in _containers + if container.status in ACTIVE_STATUS_SET + ] + for kernel_id, container in alive_containers: alive_kernels[kernel_id] = container.id session_id = SessionId(UUID(container.labels["ai.backend.session-id"])) kernel_session_map[kernel_id] = session_id