diff --git a/src/ai/backend/agent/server.py b/src/ai/backend/agent/server.py index 128bd1fd97..d3454ff72c 100644 --- a/src/ai/backend/agent/server.py +++ b/src/ai/backend/agent/server.py @@ -528,7 +528,7 @@ async def _pull(reporter: ProgressReporter, *, img_conf: ImageConfig) -> None: img_ref, img_conf["registry"], timeout=image_pull_timeout ) except asyncio.TimeoutError: - log.exception(f"Image pull timeout after {image_pull_timeout} sec") + log.exception(f"Image pull timeout (img:{str(img_ref)},s:{image_pull_timeout})") await self.agent.produce_event( ImagePullFailedEvent( image=str(img_ref), @@ -537,7 +537,7 @@ async def _pull(reporter: ProgressReporter, *, img_conf: ImageConfig) -> None: ) ) except Exception as e: - log.exception(f"Image pull failed (e:{repr(e)})") + log.exception(f"Image pull failed (img:{str(img_ref)},e:{repr(e)})") await self.agent.produce_event( ImagePullFailedEvent( image=str(img_ref), @@ -546,6 +546,7 @@ async def _pull(reporter: ProgressReporter, *, img_conf: ImageConfig) -> None: ) ) else: + log.info(f"Image pull succeeded {str(img_ref)}") await self.agent.produce_event( ImagePullFinishedEvent( image=str(img_ref), @@ -553,6 +554,7 @@ async def _pull(reporter: ProgressReporter, *, img_conf: ImageConfig) -> None: ) ) else: + log.debug(f"No need to pull image {str(img_ref)}") await self.agent.produce_event( ImagePullFinishedEvent( image=str(img_ref), diff --git a/src/ai/backend/manager/models/session.py b/src/ai/backend/manager/models/session.py index a60773829e..b8cf91be58 100644 --- a/src/ai/backend/manager/models/session.py +++ b/src/ai/backend/manager/models/session.py @@ -295,6 +295,11 @@ class SessionStatus(enum.StrEnum): SessionStatus.ERROR, SessionStatus.CANCELLED, }, + SessionStatus.CREATING: { + SessionStatus.RUNNING, + SessionStatus.ERROR, + SessionStatus.CANCELLED, + }, SessionStatus.RUNNING: { SessionStatus.RESTARTING, SessionStatus.RUNNING_DEGRADED, @@ -373,12 +378,7 @@ def determine_session_status_by_kernels(kernels: Sequence[KernelRow]) -> Session return SessionStatus.ERROR case SessionStatus.PULLING: match k.status: - case ( - KernelStatus.PULLING - | KernelStatus.PREPARING - | KernelStatus.PREPARED - | KernelStatus.RUNNING - ): + case KernelStatus.PULLING | KernelStatus.PREPARING | KernelStatus.PREPARED: continue case KernelStatus.CANCELLED: candidate = SessionStatus.CANCELLED @@ -403,6 +403,7 @@ def determine_session_status_by_kernels(kernels: Sequence[KernelRow]) -> Session case KernelStatus.CANCELLED: candidate = SessionStatus.CANCELLED case _: + # Set status to ERROR if any kernel is in exceptional state return SessionStatus.ERROR case SessionStatus.CANCELLED: match k.status: @@ -410,10 +411,9 @@ def determine_session_status_by_kernels(kernels: Sequence[KernelRow]) -> Session KernelStatus.CANCELLED | KernelStatus.PENDING | KernelStatus.SCHEDULED - | KernelStatus.PULLING | KernelStatus.PREPARING + | KernelStatus.PULLING | KernelStatus.PREPARED - | KernelStatus.CREATING ): continue case _: @@ -434,10 +434,10 @@ def determine_session_status_by_kernels(kernels: Sequence[KernelRow]) -> Session return SessionStatus.ERROR case SessionStatus.TERMINATED: match k.status: - case KernelStatus.TERMINATING: - candidate = SessionStatus.TERMINATING case KernelStatus.TERMINATED: continue + case KernelStatus.TERMINATING: + candidate = SessionStatus.TERMINATING case _: return SessionStatus.ERROR case SessionStatus.RESTARTING | SessionStatus.RUNNING_DEGRADED: diff --git a/src/ai/backend/manager/registry.py b/src/ai/backend/manager/registry.py index cef89e1ce3..0a8d188349 100644 --- a/src/ai/backend/manager/registry.py +++ b/src/ai/backend/manager/registry.py @@ -3280,7 +3280,7 @@ async def _transit(db_session: AsyncSession) -> set[SessionId]: .where( (KernelRow.image == image) & (KernelRow.agent == agent_id) - & (KernelRow.status.in_(KernelStatus.SCHEDULED, KernelStatus.PREPARING)) + & (KernelRow.status.in_((KernelStatus.SCHEDULED, KernelStatus.PREPARING))) ) # Ensures transition .with_for_update()