Skip to content

Commit

Permalink
Fix error and improve logging
Browse files (browse the repository at this point in the history)
  • Loading branch information
fregataa committed Nov 22, 2024
1 parent 6b06b00 commit 51232b8
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 13 deletions.
6 changes: 4 additions & 2 deletions src/ai/backend/agent/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -528,7 +528,7 @@ async def _pull(reporter: ProgressReporter, *, img_conf: ImageConfig) -> None:
img_ref, img_conf["registry"], timeout=image_pull_timeout
)
except asyncio.TimeoutError:
log.exception(f"Image pull timeout after {image_pull_timeout} sec")
log.exception(f"Image pull timeout (img:{str(img_ref)},s:{image_pull_timeout})")
await self.agent.produce_event(
ImagePullFailedEvent(
image=str(img_ref),
Expand All @@ -537,7 +537,7 @@ async def _pull(reporter: ProgressReporter, *, img_conf: ImageConfig) -> None:
)
)
except Exception as e:
log.exception(f"Image pull failed (e:{repr(e)})")
log.exception(f"Image pull failed (img:{str(img_ref)},e:{repr(e)})")
await self.agent.produce_event(
ImagePullFailedEvent(
image=str(img_ref),
Expand All @@ -546,13 +546,15 @@ async def _pull(reporter: ProgressReporter, *, img_conf: ImageConfig) -> None:
)
)
else:
log.info(f"Image pull succeeded {str(img_ref)}")
await self.agent.produce_event(
ImagePullFinishedEvent(
image=str(img_ref),
agent_id=self.agent.id,
)
)
else:
log.debug(f"No need to pull image {str(img_ref)}")
await self.agent.produce_event(
ImagePullFinishedEvent(
image=str(img_ref),
Expand Down
20 changes: 10 additions & 10 deletions src/ai/backend/manager/models/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,11 @@ class SessionStatus(enum.StrEnum):
SessionStatus.ERROR,
SessionStatus.CANCELLED,
},
SessionStatus.CREATING: {
SessionStatus.RUNNING,
SessionStatus.ERROR,
SessionStatus.CANCELLED,
},
SessionStatus.RUNNING: {
SessionStatus.RESTARTING,
SessionStatus.RUNNING_DEGRADED,
Expand Down Expand Up @@ -373,12 +378,7 @@ def determine_session_status_by_kernels(kernels: Sequence[KernelRow]) -> Session
return SessionStatus.ERROR
case SessionStatus.PULLING:
match k.status:
case (
KernelStatus.PULLING
| KernelStatus.PREPARING
| KernelStatus.PREPARED
| KernelStatus.RUNNING
):
case KernelStatus.PULLING | KernelStatus.PREPARING | KernelStatus.PREPARED:
continue
case KernelStatus.CANCELLED:
candidate = SessionStatus.CANCELLED
Expand All @@ -403,17 +403,17 @@ def determine_session_status_by_kernels(kernels: Sequence[KernelRow]) -> Session
case KernelStatus.CANCELLED:
candidate = SessionStatus.CANCELLED
case _:
# Set status to ERROR if any kernel is in exceptional state
return SessionStatus.ERROR
case SessionStatus.CANCELLED:
match k.status:
case (
KernelStatus.CANCELLED
| KernelStatus.PENDING
| KernelStatus.SCHEDULED
| KernelStatus.PULLING
| KernelStatus.PREPARING
| KernelStatus.PULLING
| KernelStatus.PREPARED
| KernelStatus.CREATING
):
continue
case _:
Expand All @@ -434,10 +434,10 @@ def determine_session_status_by_kernels(kernels: Sequence[KernelRow]) -> Session
return SessionStatus.ERROR
case SessionStatus.TERMINATED:
match k.status:
case KernelStatus.TERMINATING:
candidate = SessionStatus.TERMINATING
case KernelStatus.TERMINATED:
continue
case KernelStatus.TERMINATING:
candidate = SessionStatus.TERMINATING
case _:
return SessionStatus.ERROR
case SessionStatus.RESTARTING | SessionStatus.RUNNING_DEGRADED:
Expand Down
2 changes: 1 addition & 1 deletion src/ai/backend/manager/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -3280,7 +3280,7 @@ async def _transit(db_session: AsyncSession) -> set[SessionId]:
.where(
(KernelRow.image == image)
& (KernelRow.agent == agent_id)
& (KernelRow.status.in_(KernelStatus.SCHEDULED, KernelStatus.PREPARING))
& (KernelRow.status.in_((KernelStatus.SCHEDULED, KernelStatus.PREPARING)))
)
# Ensures transition
.with_for_update()
Expand Down

0 comments on commit 51232b8

Please sign in to comment.