From 808b99f6061ac37763ad3e3d86c4ecfa1b1edc5d Mon Sep 17 00:00:00 2001 From: Sanghun Lee Date: Fri, 12 Jul 2024 18:06:03 +0900 Subject: [PATCH] revert context indent change between rpc caller and try-except --- src/ai/backend/manager/registry.py | 35 +++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/src/ai/backend/manager/registry.py b/src/ai/backend/manager/registry.py index 4c7d280de92..6553961efb4 100644 --- a/src/ai/backend/manager/registry.py +++ b/src/ai/backend/manager/registry.py @@ -1796,10 +1796,39 @@ async def _update_failure() -> None: status_data=err_info, ) ) - await db_sess.execute(query) - await execute_with_retry(_update_failure) - raise + # The agent has already cancelled or issued the destruction lifecycle event + # for this batch of kernels. + for binding in items: + kernel_id = binding.kernel.id + + async def _update_failure() -> None: + async with self.db.begin_session() as db_sess: + now = datetime.now(tzutc()) + query = ( + sa.update(KernelRow) + .where(KernelRow.id == kernel_id) + .values( + status=KernelStatus.ERROR, + status_info=f"other-error ({ex!r})", + status_changed=now, + terminated_at=now, + status_history=sql_json_merge( + KernelRow.status_history, + (), + { + KernelStatus.ERROR.name: ( + now.isoformat() + ), # ["PULLING", "PREPARING"] + }, + ), + status_data=err_info, + ) + ) + await db_sess.execute(query) + + await execute_with_retry(_update_failure) + raise async def create_cluster_ssh_keypair(self) -> ClusterSSHKeyPair: key = rsa.generate_private_key(