ENH: optimize availability of vLLM #2046

Merged

merged 4 commits on Aug 9, 2024
3 changes: 3 additions & 0 deletions xinference/core/model.py
@@ -145,6 +145,9 @@ async def __pre_destroy__(self):
f"Destroy scheduler actor failed, address: {self.address}, error: {e}"
)

if hasattr(self._model, "stop") and callable(self._model.stop):
self._model.stop()

if (
isinstance(self._model, (LLMPytorchModel, LLMVLLMModel))
and self._model.model_spec.model_format == "pytorch"
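For context, the new `stop()` call is a duck-typed "stop before destroy" hook: if the wrapped model exposes a callable `stop`, it gets a chance to release engine resources before the actor is torn down. A minimal, self-contained sketch of that pattern (the class names below are illustrative, not xinference's real ones):

import asyncio

class FakeVLLMModel:
    def stop(self):
        # Pretend to release engine resources here.
        print("engine stopped")

class ModelHolder:
    def __init__(self, model):
        self._model = model

    async def __pre_destroy__(self):
        # Only call stop() when the wrapped model actually provides it.
        if hasattr(self._model, "stop") and callable(self._model.stop):
            self._model.stop()

asyncio.run(ModelHolder(FakeVLLMModel()).__pre_destroy__())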
2 changes: 2 additions & 0 deletions xinference/core/worker.py
@@ -158,6 +158,8 @@ async def recover_sub_pool(self, address):
except Exception as e:
# Report callback errors can be logged and ignored; they should not interrupt the process
logger.error("report_event error: %s" % (e))
finally:
del event_model_uid

self._model_uid_to_recover_count[model_uid] = (
recover_count - 1
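The intent of the added `finally` block, viewed in isolation: reporting the recovery event is best-effort (errors are only logged), and the local reference to the model uid is always dropped so it cannot linger after the recovery attempt. A hedged sketch with made-up names (`report_event` and `recover_once` are illustrative, not xinference APIs):

import logging

logger = logging.getLogger(__name__)

def recover_once(report_event, model_uid):
    event_model_uid = model_uid
    try:
        report_event(event_model_uid, "model is being recovered")
    except Exception as e:
        # Reporting failures are logged and ignored; they must not
        # interrupt the recovery process.
        logger.error("report_event error: %s" % (e))
    finally:
        # Always drop the local reference, mirroring the change above.
        del event_model_uid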
38 changes: 38 additions & 0 deletions xinference/model/llm/vllm/core.py
@@ -12,9 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import asyncio
import json
import logging
import multiprocessing
import os
import time
import uuid
from typing import (
@@ -240,6 +242,42 @@ def load(self):
)
self._engine = AsyncLLMEngine.from_engine_args(engine_args)

self._check_health_task = None
if hasattr(self._engine, "check_health"):
# vLLM has provided `check_health` since v0.4.1
self._check_health_task = asyncio.create_task(self._check_healthy())

def stop(self):
# Although the vLLM engine is supposed to shut down when it is deleted,
# issues such as GH#1682 report that the engine can still be alive
# after deletion, so stop it explicitly here.
logger.info("Stopping vLLM engine")
if self._check_health_task:
self._check_health_task.cancel()
if model_executor := getattr(self._engine, "model_executor", None):
model_executor.shutdown()
self._engine = None

async def _check_healthy(self, interval: int = 30):
from vllm.engine.async_llm_engine import AsyncEngineDeadError

logger.debug("Begin to check health of vLLM")

while self._engine is not None:
try:
await self._engine.check_health()
except (AsyncEngineDeadError, RuntimeError):
logger.info("Detecting vLLM is not health, prepare to quit the process")
try:
self.stop()
except Exception:
# ignore errors raised while stopping
pass
# Just kill the process and let xinference auto-recover the model
os._exit(1)
else:
await asyncio.sleep(interval)

def _sanitize_model_config(
self, model_config: Optional[VLLMModelConfig]
) -> VLLMModelConfig:
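Taken together, `stop()` and `_check_healthy()` form a small watchdog: a background asyncio task periodically calls the engine's health check and, once the engine is found dead, kills the process so that the worker's recovery path can bring the model back up. A condensed, standalone sketch of that pattern (`DummyEngine` and `watch` are illustrative names, not part of xinference or vLLM):

import asyncio
import os


class DummyEngine:
    def __init__(self):
        self.healthy = True

    async def check_health(self):
        if not self.healthy:
            raise RuntimeError("engine dead")


async def watch(engine, interval: float = 30.0):
    while engine is not None:
        try:
            await engine.check_health()
        except RuntimeError:
            # Kill the process; an external supervisor is expected to
            # auto-recover the model.
            os._exit(1)
        else:
            await asyncio.sleep(interval)


async def main():
    engine = DummyEngine()
    task = asyncio.create_task(watch(engine, interval=0.1))
    await asyncio.sleep(0.05)
    # Cancelling the task mirrors what stop() does with _check_health_task.
    task.cancel()


asyncio.run(main())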