From 90665b2fce3d7f08630fedf2402309142900382d Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Thu, 8 Aug 2024 20:49:47 +0200 Subject: [PATCH 01/15] dev --- xinference/constants.py | 1 + xinference/model/video/__init__.py | 63 +++++++ xinference/model/video/core.py | 177 ++++++++++++++++++ xinference/model/video/diffusers.py | 100 ++++++++++ xinference/model/video/model_spec.json | 11 ++ .../model/video/model_spec_modelscope.json | 12 ++ xinference/model/video/tests/__init__.py | 13 ++ .../model/video/tests/test_diffusers_video.py | 37 ++++ 8 files changed, 414 insertions(+) create mode 100644 xinference/model/video/__init__.py create mode 100644 xinference/model/video/core.py create mode 100644 xinference/model/video/diffusers.py create mode 100644 xinference/model/video/model_spec.json create mode 100644 xinference/model/video/model_spec_modelscope.json create mode 100644 xinference/model/video/tests/__init__.py create mode 100644 xinference/model/video/tests/test_diffusers_video.py diff --git a/xinference/constants.py b/xinference/constants.py index 3efad56ed3..c9ba4e5ddc 100644 --- a/xinference/constants.py +++ b/xinference/constants.py @@ -47,6 +47,7 @@ def get_xinference_home() -> str: XINFERENCE_MODEL_DIR = os.path.join(XINFERENCE_HOME, "model") XINFERENCE_LOG_DIR = os.path.join(XINFERENCE_HOME, "logs") XINFERENCE_IMAGE_DIR = os.path.join(XINFERENCE_HOME, "image") +XINFERENCE_VIDEO_DIR = os.path.join(XINFERENCE_HOME, "video") XINFERENCE_AUTH_DIR = os.path.join(XINFERENCE_HOME, "auth") XINFERENCE_CSG_ENDPOINT = str( os.environ.get(XINFERENCE_ENV_CSG_ENDPOINT, "https://hub-stg.opencsg.com/") diff --git a/xinference/model/video/__init__.py b/xinference/model/video/__init__.py new file mode 100644 index 0000000000..45f86f36be --- /dev/null +++ b/xinference/model/video/__init__.py @@ -0,0 +1,63 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import codecs +import json +import os +from itertools import chain + +from .core import ( + BUILTIN_VIDEO_MODELS, + VIDEO_MODEL_DESCRIPTIONS, + MODEL_NAME_TO_REVISION, + MODELSCOPE_VIDEO_MODELS, + VideoModelFamilyV1, + generate_image_description, + get_cache_status, + get_image_model_descriptions, +) + + +_model_spec_json = os.path.join(os.path.dirname(__file__), "model_spec.json") +_model_spec_modelscope_json = os.path.join( + os.path.dirname(__file__), "model_spec_modelscope.json" +) +BUILTIN_VIDEO_MODELS.update( + dict( + (spec["model_name"], VideoModelFamilyV1(**spec)) + for spec in json.load(codecs.open(_model_spec_json, "r", encoding="utf-8")) + ) +) +for model_name, model_spec in BUILTIN_VIDEO_MODELS.items(): + MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision) + +MODELSCOPE_VIDEO_MODELS.update( + dict( + (spec["model_name"], VideoModelFamilyV1(**spec)) + for spec in json.load( + codecs.open(_model_spec_modelscope_json, "r", encoding="utf-8") + ) + ) +) +for model_name, model_spec in MODELSCOPE_VIDEO_MODELS.items(): + MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision) + +# register model description +for model_name, model_spec in chain( + MODELSCOPE_VIDEO_MODELS.items(), BUILTIN_VIDEO_MODELS.items() +): + VIDEO_MODEL_DESCRIPTIONS.update(generate_image_description(model_spec)) + +del _model_spec_json +del _model_spec_modelscope_json diff --git a/xinference/model/video/core.py b/xinference/model/video/core.py new file mode 100644 index 0000000000..8a9f15ef95 --- /dev/null +++ b/xinference/model/video/core.py @@ -0,0 +1,177 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import logging +import os +from collections import defaultdict +from typing import Dict, List, Literal, Optional, Tuple + +from ...constants import XINFERENCE_CACHE_DIR +from ..core import CacheableModelSpec, ModelDescription +from ..utils import valid_model_revision +from .diffusers import DiffUsersVideoModel + +MAX_ATTEMPTS = 3 + +logger = logging.getLogger(__name__) + +MODEL_NAME_TO_REVISION: Dict[str, List[str]] = defaultdict(list) +VIDEO_MODEL_DESCRIPTIONS: Dict[str, List[Dict]] = defaultdict(list) +BUILTIN_VIDEO_MODELS: Dict[str, "VideoModelFamilyV1"] = {} +MODELSCOPE_VIDEO_MODELS: Dict[str, "VideoModelFamilyV1"] = {} + + +def get_image_model_descriptions(): + import copy + + return copy.deepcopy(VIDEO_MODEL_DESCRIPTIONS) + + +class VideoModelFamilyV1(CacheableModelSpec): + model_family: str + model_name: str + model_id: str + model_revision: str + model_hub: str = "huggingface" + model_ability: Optional[List[str]] + + +class VideoModelDescription(ModelDescription): + def __init__( + self, + address: Optional[str], + devices: Optional[List[str]], + model_spec: VideoModelFamilyV1, + model_path: Optional[str] = None, + ): + super().__init__(address, devices, model_path=model_path) + self._model_spec = model_spec + + def to_dict(self): + return { + "model_type": "image", + "address": self.address, + "accelerators": self.devices, + "model_name": self._model_spec.model_name, + "model_family": self._model_spec.model_family, + "model_revision": self._model_spec.model_revision, + "model_ability": self._model_spec.model_ability, + } + + def to_version_info(self): + if self._model_path is None: + is_cached = get_cache_status(self._model_spec) + file_location = get_cache_dir(self._model_spec) + else: + is_cached = True + file_location = self._model_path + + return [ + { + "model_version": self._model_spec.model_name, + "model_file_location": file_location, + "cache_status": is_cached, + } + ] + + +def generate_image_description( + image_model: VideoModelFamilyV1, +) -> Dict[str, List[Dict]]: + res = defaultdict(list) + res[image_model.model_name].extend( + VideoModelDescription(None, None, image_model).to_version_info() + ) + return res + + +def match_diffusion( + model_name: str, + download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None, +) -> VideoModelFamilyV1: + from ..utils import download_from_modelscope + from . 
import BUILTIN_VIDEO_MODELS, MODELSCOPE_VIDEO_MODELS + + if download_hub == "modelscope" and model_name in MODELSCOPE_VIDEO_MODELS: + logger.debug(f"Image model {model_name} found in ModelScope.") + return MODELSCOPE_VIDEO_MODELS[model_name] + elif download_hub == "huggingface" and model_name in BUILTIN_VIDEO_MODELS: + logger.debug(f"Image model {model_name} found in Huggingface.") + return BUILTIN_VIDEO_MODELS[model_name] + elif download_from_modelscope() and model_name in MODELSCOPE_VIDEO_MODELS: + logger.debug(f"Image model {model_name} found in ModelScope.") + return MODELSCOPE_VIDEO_MODELS[model_name] + elif model_name in BUILTIN_VIDEO_MODELS: + logger.debug(f"Image model {model_name} found in Huggingface.") + return BUILTIN_VIDEO_MODELS[model_name] + else: + raise ValueError( + f"Image model {model_name} not found, available" + f"model list: {BUILTIN_VIDEO_MODELS.keys()}" + ) + + +def cache(model_spec: VideoModelFamilyV1): + from ..utils import cache + + return cache(model_spec, VideoModelDescription) + + +def get_cache_dir(model_spec: VideoModelFamilyV1): + return os.path.realpath(os.path.join(XINFERENCE_CACHE_DIR, model_spec.model_name)) + + +def get_cache_status( + model_spec: VideoModelFamilyV1, +) -> bool: + cache_dir = get_cache_dir(model_spec) + meta_path = os.path.join(cache_dir, "__valid_download") + + model_name = model_spec.model_name + if model_name in BUILTIN_VIDEO_MODELS and model_name in MODELSCOPE_VIDEO_MODELS: + hf_spec = BUILTIN_VIDEO_MODELS[model_name] + ms_spec = MODELSCOPE_VIDEO_MODELS[model_name] + + return any( + [ + valid_model_revision(meta_path, hf_spec.model_revision), + valid_model_revision(meta_path, ms_spec.model_revision), + ] + ) + else: # Usually for UT + return valid_model_revision(meta_path, model_spec.model_revision) + + +def create_video_model_instance( + subpool_addr: str, + devices: List[str], + model_uid: str, + model_name: str, + download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None, + model_path: Optional[str] = None, + **kwargs, +) -> Tuple[DiffUsersVideoModel, VideoModelDescription]: + model_spec = match_diffusion(model_name, download_hub) + if not model_path: + model_path = cache(model_spec) + + model = DiffUsersVideoModel( + model_uid, + model_path, + model_spec, + **kwargs, + ) + model_description = VideoModelDescription( + subpool_addr, devices, model_spec, model_path=model_path + ) + return model, model_description diff --git a/xinference/model/video/diffusers.py b/xinference/model/video/diffusers.py new file mode 100644 index 0000000000..4bf1205aed --- /dev/null +++ b/xinference/model/video/diffusers.py @@ -0,0 +1,100 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +import os +import sys +import uuid +import torch + +from ...constants import XINFERENCE_VIDEO_DIR +from ...device_utils import move_model_to_available_device + +logger = logging.getLogger(__name__) + + +class DiffUsersVideoModel: + def __init__( + self, + model_uid: str, + model_path: str, + model_spec: "VideoModelFamilyV1", + **kwargs, + ): + self._model_uid = model_uid + self._model_path = model_path + self._model_spec = model_spec + self._model = None + self._kwargs = kwargs + + def load(self): + import torch + + torch_dtype = self._kwargs.get("torch_dtype") + if sys.platform != "darwin" and torch_dtype is None: + # The following params crashes on Mac M2 + self._kwargs["torch_dtype"] = torch.float16 + self._kwargs["variant"] = "fp16" + self._kwargs["use_safetensors"] = True + if isinstance(torch_dtype, str): + self._kwargs["torch_dtype"] = getattr(torch, torch_dtype) + + if self._model_spec.model_family == "CogVideoX": + from diffusers import CogVideoXPipeline + + self._model = CogVideoXPipeline.from_pretrained( + self._model_path, torch_dtype=torch.float16 + ) + else: + raise Exception( + f"Unsupported model family: {self._model_spec.model_family}" + ) + + if self._kwargs.get("cpu_offload", False): + logger.debug("CPU offloading model") + self._model.enable_model_cpu_offload() + elif not self._kwargs.get("device_map"): + logger.debug("Loading model to available device") + self._model = move_model_to_available_device(self._model) + # Recommended if your computer has < 64 GB of RAM + self._model.enable_attention_slicing() + + def text_to_image( + self, + prompt: str, + n: int = 1, + **kwargs, + ): + from diffusers.utils import export_to_video + + logger.debug( + "diffusers args: %s", + kwargs, + ) + # assert callable(self._model) + prompt_embeds, _ = self._model.encode_prompt( + prompt=prompt, + do_classifier_free_guidance=True, + num_videos_per_prompt=n, + max_sequence_length=226, + device=self._model.device, + dtype=torch.float16, + ) + video = self._model( + num_inference_steps=50, + guidance_scale=6, + prompt_embeds=prompt_embeds, + ).frames[0] + path = os.path.join(XINFERENCE_VIDEO_DIR, uuid.uuid4().hex + ".jpg") + export_to_video(video, path, fps=8) diff --git a/xinference/model/video/model_spec.json b/xinference/model/video/model_spec.json new file mode 100644 index 0000000000..52b748fd6a --- /dev/null +++ b/xinference/model/video/model_spec.json @@ -0,0 +1,11 @@ +[ + { + "model_name": "CogVideoX-2b", + "model_family": "CogVideoX", + "model_id": "THUDM/CogVideoX-2b", + "model_revision": "4bbfb1de622b80bc1b77b6e9aced75f816be0e38", + "model_ability": [ + "text2video" + ] + } +] diff --git a/xinference/model/video/model_spec_modelscope.json b/xinference/model/video/model_spec_modelscope.json new file mode 100644 index 0000000000..e3cb604921 --- /dev/null +++ b/xinference/model/video/model_spec_modelscope.json @@ -0,0 +1,12 @@ +[ + { + "model_name": "CogVideoX-2b", + "model_family": "CogVideoX", + "model_hub": "modelscope", + "model_id": "ZhipuAI/CogVideoX-2b", + "model_revision": "master", + "model_ability": [ + "text2video" + ] + } +] diff --git a/xinference/model/video/tests/__init__.py b/xinference/model/video/tests/__init__.py new file mode 100644 index 0000000000..37f6558d95 --- /dev/null +++ b/xinference/model/video/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/xinference/model/video/tests/test_diffusers_video.py b/xinference/model/video/tests/test_diffusers_video.py
new file mode 100644
index 0000000000..b99f478d02
--- /dev/null
+++ b/xinference/model/video/tests/test_diffusers_video.py
@@ -0,0 +1,37 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import base64
+import logging
+import os.path
+import shutil
+from io import BytesIO
+
+from PIL import Image
+
+from ..core import cache
+from ..diffusers import DiffUsersVideoModel
+from .. import BUILTIN_VIDEO_MODELS
+
+
+logger = logging.getLogger(__name__)
+
+
+def test_model():
+    test_model_spec = next(iter(BUILTIN_VIDEO_MODELS.values()))
+    model_path = cache(test_model_spec)
+    model = DiffUsersVideoModel("mock", model_path, test_model_spec)
+    # input is a string
+    input_text = "an apple"
+    model.load()
+    r = model.text_to_image(input_text)

From f24ff2da288ae54791247e8be5fdf3981bdb1465 Mon Sep 17 00:00:00 2001
From: codingl2k1
Date: Thu, 8 Aug 2024 21:19:27 +0200
Subject: [PATCH 02/15] List video for /v1/model_registrations/video

---
 xinference/api/restful_api.py               | 40 +++++++++++++++++++++
 xinference/client/restful/restful_client.py | 39 ++++++++++++++++++++
 xinference/core/model.py                    | 21 +++++++++++
 xinference/core/supervisor.py               | 37 +++++++++++++++++++
 xinference/core/worker.py                   |  2 ++
 xinference/model/core.py                    | 12 +++++++
 xinference/model/video/diffusers.py         | 16 ++++++---
 xinference/types.py                         |  5 +++
 8 files changed, 167 insertions(+), 5 deletions(-)

diff --git a/xinference/api/restful_api.py b/xinference/api/restful_api.py
index 87e19d98cf..f330f58e19 100644
--- a/xinference/api/restful_api.py
+++ b/xinference/api/restful_api.py
@@ -123,6 +123,14 @@ class TextToImageRequest(BaseModel):
     user: Optional[str] = None
 
 
+class TextToVideoRequest(BaseModel):
+    model: str
+    prompt: Union[str, List[str]] = Field(description="The prompt to generate videos from.")
+    n: Optional[int] = 1
+    kwargs: Optional[str] = None
+    user: Optional[str] = None
+
+
 class SpeechRequest(BaseModel):
     model: str
     input: str
@@ -1546,6 +1554,38 @@ async def create_flexible_infer(self, request: Request) -> Response:
             await self._report_error_event(model_uid, str(e))
             raise HTTPException(status_code=500, detail=str(e))
 
+    async def create_videos(self, request: Request) -> Response:
+        body = TextToVideoRequest.parse_obj(await request.json())
+        model_uid = body.model
+        try:
+            model = await (await self._get_supervisor_ref()).get_model(model_uid)
+        except ValueError as ve:
+            logger.error(str(ve), exc_info=True)
+            await self._report_error_event(model_uid, str(ve))
+            raise 
HTTPException(status_code=400, detail=str(ve)) + except Exception as e: + logger.error(e, exc_info=True) + await self._report_error_event(model_uid, str(e)) + raise HTTPException(status_code=500, detail=str(e)) + + try: + kwargs = json.loads(body.kwargs) if body.kwargs else {} + video_list = await model.text_to_video( + prompt=body.prompt, + n=body.n, + **kwargs, + ) + return Response(content=video_list, media_type="application/json") + except RuntimeError as re: + logger.error(re, exc_info=True) + await self._report_error_event(model_uid, str(re)) + self.handle_request_limit_error(re) + raise HTTPException(status_code=400, detail=str(re)) + except Exception as e: + logger.error(e, exc_info=True) + await self._report_error_event(model_uid, str(e)) + raise HTTPException(status_code=500, detail=str(e)) + async def create_chat_completion(self, request: Request) -> Response: raw_body = await request.json() body = CreateChatCompletion.parse_obj(raw_body) diff --git a/xinference/client/restful/restful_client.py b/xinference/client/restful/restful_client.py index aa0955f75d..c4368368d2 100644 --- a/xinference/client/restful/restful_client.py +++ b/xinference/client/restful/restful_client.py @@ -29,6 +29,7 @@ CompletionChunk, Embedding, ImageList, + VideoList, LlamaCppGenerateConfig, PytorchGenerateConfig, ) @@ -370,6 +371,44 @@ def inpainting( return response_data +class RESTfulVideoModelHandle(RESTfulModelHandle): + def text_to_video( + self, + prompt: str, + n: int = 1, + **kwargs, + ) -> "VideoList": + """ + Creates an image by the input text. + + Parameters + ---------- + prompt: `str` or `List[str]` + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + n: `int`, defaults to 1 + The number of images to generate per prompt. Must be between 1 and 10. + Returns + ------- + ImageList + A list of image objects. + """ + url = f"{self._base_url}/v1/images/generations" + request_body = { + "model": self._model_uid, + "prompt": prompt, + "n": n, + "kwargs": json.dumps(kwargs), + } + response = requests.post(url, json=request_body, headers=self.auth_headers) + if response.status_code != 200: + raise RuntimeError( + f"Failed to create the images, detail: {_get_error_string(response)}" + ) + + response_data = response.json() + return response_data + + class RESTfulGenerateModelHandle(RESTfulModelHandle): def generate( self, diff --git a/xinference/core/model.py b/xinference/core/model.py index 7fc41b9c53..24cfe3c6e8 100644 --- a/xinference/core/model.py +++ b/xinference/core/model.py @@ -774,6 +774,27 @@ async def infer( f"Model {self._model.model_spec} is not for flexible infer." ) + @log_async(logger=logger) + @request_limit + async def text_to_video( + self, + prompt: str, + n: int = 1, + *args, + **kwargs, + ): + if hasattr(self._model, "text_to_video"): + return await self._call_wrapper_json( + self._model.text_to_video, + prompt, + n, + *args, + **kwargs, + ) + raise AttributeError( + f"Model {self._model.model_spec} is not for creating video." 
+ ) + async def record_metrics(self, name, op, kwargs): worker_ref = await self._get_worker_ref() await worker_ref.record_metrics(name, op, kwargs) diff --git a/xinference/core/supervisor.py b/xinference/core/supervisor.py index 54e4b65849..272f4c23cb 100644 --- a/xinference/core/supervisor.py +++ b/xinference/core/supervisor.py @@ -59,6 +59,7 @@ if TYPE_CHECKING: from ..model.audio import AudioModelFamilyV1 + from ..model.video import VideoModelFamilyV1 from ..model.embedding import EmbeddingModelSpec from ..model.flexible import FlexibleModelSpec from ..model.image import ImageModelFamilyV1 @@ -484,6 +485,31 @@ async def _to_audio_model_reg( res["model_instance_count"] = instance_cnt return res + async def _to_video_model_reg( + self, model_family: "VideoModelFamilyV1", is_builtin: bool + ) -> Dict[str, Any]: + from ..model.video import get_cache_status + + instance_cnt = await self.get_instance_count(model_family.model_name) + version_cnt = await self.get_model_version_count(model_family.model_name) + + if self.is_local_deployment(): + # TODO: does not work when the supervisor and worker are running on separate nodes. + cache_status = get_cache_status(model_family) + res = { + **model_family.dict(), + "cache_status": cache_status, + "is_builtin": is_builtin, + } + else: + res = { + **model_family.dict(), + "is_builtin": is_builtin, + } + res["model_version_count"] = version_cnt + res["model_instance_count"] = instance_cnt + return res + async def _to_flexible_model_reg( self, model_spec: "FlexibleModelSpec", is_builtin: bool ) -> Dict[str, Any]: @@ -602,6 +628,17 @@ def sort_helper(item): {"model_name": model_spec.model_name, "is_builtin": False} ) + ret.sort(key=sort_helper) + return ret + elif model_type == "video": + from ..model.video import BUILTIN_VIDEO_MODELS + + for model_name, family in BUILTIN_VIDEO_MODELS.items(): + if detailed: + ret.append(await self._to_video_model_reg(family, is_builtin=True)) + else: + ret.append({"model_name": model_name, "is_builtin": True}) + ret.sort(key=sort_helper) return ret elif model_type == "rerank": diff --git a/xinference/core/worker.py b/xinference/core/worker.py index 9524bd604a..cfffd7fb17 100644 --- a/xinference/core/worker.py +++ b/xinference/core/worker.py @@ -735,6 +735,8 @@ async def _get_model_ability(self, model: Any, model_type: str) -> List[str]: return ["text_to_image"] elif model_type == "audio": return ["audio_to_text"] + elif model_type == "video": + return ["text_to_video"] elif model_type == "flexible": return ["flexible"] else: diff --git a/xinference/model/core.py b/xinference/model/core.py index 09cb4104a4..4df04e621f 100644 --- a/xinference/model/core.py +++ b/xinference/model/core.py @@ -60,6 +60,7 @@ def create_model_instance( **kwargs, ) -> Tuple[Any, ModelDescription]: from .audio.core import create_audio_model_instance + from .video.core import create_video_model_instance from .embedding.core import create_embedding_model_instance from .flexible.core import create_flexible_model_instance from .image.core import create_image_model_instance @@ -127,6 +128,17 @@ def create_model_instance( model_path, **kwargs, ) + elif model_type == "video": + kwargs.pop("trust_remote_code", None) + return create_video_model_instance( + subpool_addr, + devices, + model_uid, + model_name, + download_hub, + model_path, + **kwargs, + ) elif model_type == "flexible": kwargs.pop("trust_remote_code", None) return create_flexible_model_instance( diff --git a/xinference/model/video/diffusers.py b/xinference/model/video/diffusers.py index 
4bf1205aed..b629e81807 100644 --- a/xinference/model/video/diffusers.py +++ b/xinference/model/video/diffusers.py @@ -17,9 +17,11 @@ import sys import uuid import torch +import time from ...constants import XINFERENCE_VIDEO_DIR from ...device_utils import move_model_to_available_device +from ...types import VideoList logger = logging.getLogger(__name__) @@ -75,7 +77,7 @@ def text_to_image( prompt: str, n: int = 1, **kwargs, - ): + ) -> VideoList: from diffusers.utils import export_to_video logger.debug( @@ -91,10 +93,14 @@ def text_to_image( device=self._model.device, dtype=torch.float16, ) - video = self._model( + output = self._model( num_inference_steps=50, guidance_scale=6, prompt_embeds=prompt_embeds, - ).frames[0] - path = os.path.join(XINFERENCE_VIDEO_DIR, uuid.uuid4().hex + ".jpg") - export_to_video(video, path, fps=8) + ) + urls = [] + for f in output.frames: + path = os.path.join(XINFERENCE_VIDEO_DIR, uuid.uuid4().hex + ".jpg") + p = export_to_video(f, path, fps=8) + urls.append(p) + return VideoList(created=int(time.time()), data=urls) diff --git a/xinference/types.py b/xinference/types.py index e66e90bee1..4e1106ae51 100644 --- a/xinference/types.py +++ b/xinference/types.py @@ -52,6 +52,11 @@ class ImageList(TypedDict): data: List[Image] +class VideoList(TypedDict): + created: int + data: List[str] + + class EmbeddingUsage(TypedDict): prompt_tokens: int total_tokens: int From 5248b297a884e5b1ff8822c474aaf61e3cef8fa3 Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Thu, 8 Aug 2024 21:21:48 +0200 Subject: [PATCH 03/15] Add restful client --- xinference/api/restful_api.py | 12 ++++++++++++ xinference/client/restful/restful_client.py | 6 +++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/xinference/api/restful_api.py b/xinference/api/restful_api.py index f330f58e19..31fac8be86 100644 --- a/xinference/api/restful_api.py +++ b/xinference/api/restful_api.py @@ -64,6 +64,7 @@ CreateChatCompletion, CreateCompletion, ImageList, + VideoList, PeftModelConfig, max_tokens_field, ) @@ -520,6 +521,17 @@ async def internal_exception_handler(request: Request, exc: Exception): else None ), ) + self._router.add_api_route( + "/v1/video/generations", + self.create_videos, + methods=["POST"], + response_model=VideoList, + dependencies=( + [Security(self._auth_service, scopes=["models:read"])] + if self.is_authenticated() + else None + ), + ) self._router.add_api_route( "/v1/chat/completions", self.create_chat_completion, diff --git a/xinference/client/restful/restful_client.py b/xinference/client/restful/restful_client.py index c4368368d2..987ddac268 100644 --- a/xinference/client/restful/restful_client.py +++ b/xinference/client/restful/restful_client.py @@ -392,7 +392,7 @@ def text_to_video( ImageList A list of image objects. 
""" - url = f"{self._base_url}/v1/images/generations" + url = f"{self._base_url}/v1/video/generations" request_body = { "model": self._model_uid, "prompt": prompt, @@ -1054,6 +1054,10 @@ def get_model(self, model_uid: str) -> RESTfulModelHandle: return RESTfulAudioModelHandle( model_uid, self.base_url, auth_headers=self._headers ) + elif desc["model_type"] == "video": + return RESTfulVideoModelHandle( + model_uid, self.base_url, auth_headers=self._headers + ) elif desc["model_type"] == "flexible": return RESTfulFlexibleModelHandle( model_uid, self.base_url, auth_headers=self._headers From bc37a0987c6d0ab3eab6796c03c21421a61c1bcc Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Thu, 8 Aug 2024 21:52:00 +0200 Subject: [PATCH 04/15] Fix --- xinference/model/video/__init__.py | 6 +++--- xinference/model/video/core.py | 22 +++++++++++----------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/xinference/model/video/__init__.py b/xinference/model/video/__init__.py index 45f86f36be..68bec15379 100644 --- a/xinference/model/video/__init__.py +++ b/xinference/model/video/__init__.py @@ -23,9 +23,9 @@ MODEL_NAME_TO_REVISION, MODELSCOPE_VIDEO_MODELS, VideoModelFamilyV1, - generate_image_description, + generate_video_description, get_cache_status, - get_image_model_descriptions, + get_video_model_descriptions, ) @@ -57,7 +57,7 @@ for model_name, model_spec in chain( MODELSCOPE_VIDEO_MODELS.items(), BUILTIN_VIDEO_MODELS.items() ): - VIDEO_MODEL_DESCRIPTIONS.update(generate_image_description(model_spec)) + VIDEO_MODEL_DESCRIPTIONS.update(generate_video_description(model_spec)) del _model_spec_json del _model_spec_modelscope_json diff --git a/xinference/model/video/core.py b/xinference/model/video/core.py index 8a9f15ef95..f2b0a3cce2 100644 --- a/xinference/model/video/core.py +++ b/xinference/model/video/core.py @@ -31,7 +31,7 @@ MODELSCOPE_VIDEO_MODELS: Dict[str, "VideoModelFamilyV1"] = {} -def get_image_model_descriptions(): +def get_video_model_descriptions(): import copy return copy.deepcopy(VIDEO_MODEL_DESCRIPTIONS) @@ -59,7 +59,7 @@ def __init__( def to_dict(self): return { - "model_type": "image", + "model_type": "video", "address": self.address, "accelerators": self.devices, "model_name": self._model_spec.model_name, @@ -85,12 +85,12 @@ def to_version_info(self): ] -def generate_image_description( - image_model: VideoModelFamilyV1, +def generate_video_description( + video_model: VideoModelFamilyV1, ) -> Dict[str, List[Dict]]: res = defaultdict(list) - res[image_model.model_name].extend( - VideoModelDescription(None, None, image_model).to_version_info() + res[video_model.model_name].extend( + VideoModelDescription(None, None, video_model).to_version_info() ) return res @@ -103,20 +103,20 @@ def match_diffusion( from . 
import BUILTIN_VIDEO_MODELS, MODELSCOPE_VIDEO_MODELS if download_hub == "modelscope" and model_name in MODELSCOPE_VIDEO_MODELS: - logger.debug(f"Image model {model_name} found in ModelScope.") + logger.debug(f"Video model {model_name} found in ModelScope.") return MODELSCOPE_VIDEO_MODELS[model_name] elif download_hub == "huggingface" and model_name in BUILTIN_VIDEO_MODELS: - logger.debug(f"Image model {model_name} found in Huggingface.") + logger.debug(f"Video model {model_name} found in Huggingface.") return BUILTIN_VIDEO_MODELS[model_name] elif download_from_modelscope() and model_name in MODELSCOPE_VIDEO_MODELS: - logger.debug(f"Image model {model_name} found in ModelScope.") + logger.debug(f"Video model {model_name} found in ModelScope.") return MODELSCOPE_VIDEO_MODELS[model_name] elif model_name in BUILTIN_VIDEO_MODELS: - logger.debug(f"Image model {model_name} found in Huggingface.") + logger.debug(f"Video model {model_name} found in Huggingface.") return BUILTIN_VIDEO_MODELS[model_name] else: raise ValueError( - f"Image model {model_name} not found, available" + f"Video model {model_name} not found, available" f"model list: {BUILTIN_VIDEO_MODELS.keys()}" ) From ed92c9ef35c717c4e2b963a66955769334d6264d Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Thu, 8 Aug 2024 21:52:30 +0200 Subject: [PATCH 05/15] Add running models UI --- .../web/ui/src/scenes/launch_model/index.js | 4 ++++ .../web/ui/src/scenes/running_models/index.js | 24 +++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/xinference/web/ui/src/scenes/launch_model/index.js b/xinference/web/ui/src/scenes/launch_model/index.js index 1339e94d4f..55f05747bd 100644 --- a/xinference/web/ui/src/scenes/launch_model/index.js +++ b/xinference/web/ui/src/scenes/launch_model/index.js @@ -69,6 +69,7 @@ const LaunchModel = () => { + @@ -93,6 +94,9 @@ const LaunchModel = () => { + + + diff --git a/xinference/web/ui/src/scenes/running_models/index.js b/xinference/web/ui/src/scenes/running_models/index.js index e91858f2fd..9f9486651a 100644 --- a/xinference/web/ui/src/scenes/running_models/index.js +++ b/xinference/web/ui/src/scenes/running_models/index.js @@ -21,6 +21,7 @@ const RunningModels = () => { const [embeddingModelData, setEmbeddingModelData] = useState([]) const [imageModelData, setImageModelData] = useState([]) const [audioModelData, setAudioModelData] = useState([]) + const [videoModelData, setVideoModelData] = useState([]) const [rerankModelData, setRerankModelData] = useState([]) const [flexibleModelData, setFlexibleModelData] = useState([]) const { isCallingApi, setIsCallingApi } = useContext(ApiContext) @@ -53,6 +54,9 @@ const RunningModels = () => { setAudioModelData([ { id: 'Loading, do not refresh page...', url: 'IS_LOADING' }, ]) + setVideoModelData([ + { id: 'Loading, do not refresh page...', url: 'IS_LOADING' }, + ]) setImageModelData([ { id: 'Loading, do not refresh page...', url: 'IS_LOADING' }, ]) @@ -72,6 +76,7 @@ const RunningModels = () => { const newEmbeddingModelData = [] const newImageModelData = [] const newAudioModelData = [] + const newVideoModelData = [] const newRerankModelData = [] const newFlexibleModelData = [] response.data.forEach((model) => { @@ -86,6 +91,8 @@ const RunningModels = () => { newEmbeddingModelData.push(newValue) } else if (newValue.model_type === 'audio') { newAudioModelData.push(newValue) + } else if (newValue.model_type === 'video') { + newVideoModelData.push(newValue) } else if (newValue.model_type === 'image') { newImageModelData.push(newValue) } else if 
(newValue.model_type === 'rerank') { @@ -97,6 +104,7 @@ const RunningModels = () => { setLlmData(newLlmData) setEmbeddingModelData(newEmbeddingModelData) setAudioModelData(newAudioModelData) + setVideoModelData(newVideoModelData) setImageModelData(newImageModelData) setRerankModelData(newRerankModelData) setFlexibleModelData(newFlexibleModelData) @@ -591,6 +599,7 @@ const RunningModels = () => { }, ] const audioModelColumns = embeddingModelColumns + const videoModelColumns = embeddingModelColumns const rerankModelColumns = embeddingModelColumns const flexibleModelColumns = embeddingModelColumns @@ -652,6 +661,7 @@ const RunningModels = () => { + @@ -725,6 +735,20 @@ const RunningModels = () => { /> + + + + + Date: Thu, 8 Aug 2024 22:05:33 +0200 Subject: [PATCH 06/15] Fix --- xinference/model/video/diffusers.py | 10 +++++-- .../model/video/tests/test_diffusers_video.py | 29 +++++++++++++++++++ 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/xinference/model/video/diffusers.py b/xinference/model/video/diffusers.py index b629e81807..94dde86aac 100644 --- a/xinference/model/video/diffusers.py +++ b/xinference/model/video/diffusers.py @@ -40,6 +40,10 @@ def __init__( self._model = None self._kwargs = kwargs + @property + def model_spec(self): + return self._model_spec + def load(self): import torch @@ -72,7 +76,7 @@ def load(self): # Recommended if your computer has < 64 GB of RAM self._model.enable_attention_slicing() - def text_to_image( + def text_to_video( self, prompt: str, n: int = 1, @@ -81,7 +85,7 @@ def text_to_image( from diffusers.utils import export_to_video logger.debug( - "diffusers args: %s", + "diffusers text_to_video args: %s", kwargs, ) # assert callable(self._model) @@ -100,7 +104,7 @@ def text_to_image( ) urls = [] for f in output.frames: - path = os.path.join(XINFERENCE_VIDEO_DIR, uuid.uuid4().hex + ".jpg") + path = os.path.join(XINFERENCE_VIDEO_DIR, uuid.uuid4().hex + ".mp4") p = export_to_video(f, path, fps=8) urls.append(p) return VideoList(created=int(time.time()), data=urls) diff --git a/xinference/model/video/tests/test_diffusers_video.py b/xinference/model/video/tests/test_diffusers_video.py index b99f478d02..65821f9046 100644 --- a/xinference/model/video/tests/test_diffusers_video.py +++ b/xinference/model/video/tests/test_diffusers_video.py @@ -35,3 +35,32 @@ def test_model(): input_text = "an apple" model.load() r = model.text_to_image(input_text) + assert r + + +def test_client(setup): + endpoint, _ = setup + from ....client import Client + + client = Client(endpoint) + + model_uid = client.launch_model( + model_uid="my_video_model", + model_name="CogVideoX-2b", + model_type="video", + ) + model = client.get_model(model_uid) + assert model + + r = model.text_to_video( + prompt="A panda, dressed in a small, red jacket and a tiny hat, " + "sits on a wooden stool in a serene bamboo forest. " + "The panda's fluffy paws strum a miniature acoustic guitar, " + "producing soft, melodic tunes. Nearby, a few other pandas gather, " + "watching curiously and some clapping in rhythm. " + "Sunlight filters through the tall bamboo, casting a gentle glow on the scene. " + "The panda's face is expressive, showing concentration and joy as it plays. " + "The background includes a small, flowing stream and vibrant green foliage, " + "enhancing the peaceful and magical atmosphere of this unique musical performance." 
+ ) + print(r) From ca57593137e9870d7c6ff37c6de16a403a937d21 Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Thu, 8 Aug 2024 22:07:36 +0200 Subject: [PATCH 07/15] Fix lint --- xinference/api/restful_api.py | 2 +- xinference/client/restful/restful_client.py | 2 +- xinference/core/supervisor.py | 2 +- xinference/model/core.py | 2 +- xinference/model/video/__init__.py | 3 +-- xinference/model/video/diffusers.py | 3 ++- xinference/model/video/tests/test_diffusers_video.py | 3 +-- 7 files changed, 8 insertions(+), 9 deletions(-) diff --git a/xinference/api/restful_api.py b/xinference/api/restful_api.py index 31fac8be86..47b4848c80 100644 --- a/xinference/api/restful_api.py +++ b/xinference/api/restful_api.py @@ -64,8 +64,8 @@ CreateChatCompletion, CreateCompletion, ImageList, - VideoList, PeftModelConfig, + VideoList, max_tokens_field, ) from .oauth2.auth_service import AuthService diff --git a/xinference/client/restful/restful_client.py b/xinference/client/restful/restful_client.py index 987ddac268..ca96d5a785 100644 --- a/xinference/client/restful/restful_client.py +++ b/xinference/client/restful/restful_client.py @@ -29,9 +29,9 @@ CompletionChunk, Embedding, ImageList, - VideoList, LlamaCppGenerateConfig, PytorchGenerateConfig, + VideoList, ) diff --git a/xinference/core/supervisor.py b/xinference/core/supervisor.py index 272f4c23cb..2b6f7b9fc5 100644 --- a/xinference/core/supervisor.py +++ b/xinference/core/supervisor.py @@ -59,12 +59,12 @@ if TYPE_CHECKING: from ..model.audio import AudioModelFamilyV1 - from ..model.video import VideoModelFamilyV1 from ..model.embedding import EmbeddingModelSpec from ..model.flexible import FlexibleModelSpec from ..model.image import ImageModelFamilyV1 from ..model.llm import LLMFamilyV1 from ..model.rerank import RerankModelSpec + from ..model.video import VideoModelFamilyV1 from .worker import WorkerActor diff --git a/xinference/model/core.py b/xinference/model/core.py index 4df04e621f..4591d255b0 100644 --- a/xinference/model/core.py +++ b/xinference/model/core.py @@ -60,12 +60,12 @@ def create_model_instance( **kwargs, ) -> Tuple[Any, ModelDescription]: from .audio.core import create_audio_model_instance - from .video.core import create_video_model_instance from .embedding.core import create_embedding_model_instance from .flexible.core import create_flexible_model_instance from .image.core import create_image_model_instance from .llm.core import create_llm_model_instance from .rerank.core import create_rerank_model_instance + from .video.core import create_video_model_instance if model_type == "LLM": return create_llm_model_instance( diff --git a/xinference/model/video/__init__.py b/xinference/model/video/__init__.py index 68bec15379..e1325b0bbb 100644 --- a/xinference/model/video/__init__.py +++ b/xinference/model/video/__init__.py @@ -19,16 +19,15 @@ from .core import ( BUILTIN_VIDEO_MODELS, - VIDEO_MODEL_DESCRIPTIONS, MODEL_NAME_TO_REVISION, MODELSCOPE_VIDEO_MODELS, + VIDEO_MODEL_DESCRIPTIONS, VideoModelFamilyV1, generate_video_description, get_cache_status, get_video_model_descriptions, ) - _model_spec_json = os.path.join(os.path.dirname(__file__), "model_spec.json") _model_spec_modelscope_json = os.path.join( os.path.dirname(__file__), "model_spec_modelscope.json" diff --git a/xinference/model/video/diffusers.py b/xinference/model/video/diffusers.py index 94dde86aac..d14a811de6 100644 --- a/xinference/model/video/diffusers.py +++ b/xinference/model/video/diffusers.py @@ -15,9 +15,10 @@ import logging import os import sys +import time import uuid + 
import torch -import time from ...constants import XINFERENCE_VIDEO_DIR from ...device_utils import move_model_to_available_device diff --git a/xinference/model/video/tests/test_diffusers_video.py b/xinference/model/video/tests/test_diffusers_video.py index 65821f9046..2982486f99 100644 --- a/xinference/model/video/tests/test_diffusers_video.py +++ b/xinference/model/video/tests/test_diffusers_video.py @@ -19,10 +19,9 @@ from PIL import Image +from .. import BUILTIN_VIDEO_MODELS from ..core import cache from ..diffusers import DiffUsersVideoModel -from .. import BUILTIN_VIDEO_MODELS - logger = logging.getLogger(__name__) From 00aa459ec24433aeceac7b77e02551b9b41e0a9b Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Thu, 8 Aug 2024 22:12:36 +0200 Subject: [PATCH 08/15] Fix lint --- xinference/client/restful/restful_client.py | 10 +++++----- xinference/model/video/tests/test_diffusers_video.py | 6 ------ 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/xinference/client/restful/restful_client.py b/xinference/client/restful/restful_client.py index ca96d5a785..24ba0b7038 100644 --- a/xinference/client/restful/restful_client.py +++ b/xinference/client/restful/restful_client.py @@ -379,18 +379,18 @@ def text_to_video( **kwargs, ) -> "VideoList": """ - Creates an image by the input text. + Creates a video by the input text. Parameters ---------- prompt: `str` or `List[str]` - The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + The prompt or prompts to guide video generation. If not defined, you need to pass `prompt_embeds`. n: `int`, defaults to 1 - The number of images to generate per prompt. Must be between 1 and 10. + The number of videos to generate per prompt. Must be between 1 and 10. Returns ------- - ImageList - A list of image objects. + VideoList + A list of video objects. """ url = f"{self._base_url}/v1/video/generations" request_body = { diff --git a/xinference/model/video/tests/test_diffusers_video.py b/xinference/model/video/tests/test_diffusers_video.py index 2982486f99..b5ca5dcb6f 100644 --- a/xinference/model/video/tests/test_diffusers_video.py +++ b/xinference/model/video/tests/test_diffusers_video.py @@ -11,13 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import base64 import logging -import os.path -import shutil -from io import BytesIO - -from PIL import Image from .. 
import BUILTIN_VIDEO_MODELS
 from ..core import cache
 from ..diffusers import DiffUsersVideoModel
-from .. import BUILTIN_VIDEO_MODELS
-
 
 logger = logging.getLogger(__name__)

From 00aa459ec24433aeceac7b77e02551b9b41e0a9b Mon Sep 17 00:00:00 2001
From: codingl2k1
Date: Thu, 8 Aug 2024 22:12:36 +0200
Subject: [PATCH 08/15] Fix lint

---
 xinference/client/restful/restful_client.py          | 10 +++++-----
 xinference/model/video/tests/test_diffusers_video.py |  6 ------
 2 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/xinference/client/restful/restful_client.py b/xinference/client/restful/restful_client.py
index ca96d5a785..24ba0b7038 100644
--- a/xinference/client/restful/restful_client.py
+++ b/xinference/client/restful/restful_client.py
@@ -379,18 +379,18 @@ def text_to_video(
         **kwargs,
     ) -> "VideoList":
         """
-        Creates an image by the input text.
+        Creates a video by the input text.
 
         Parameters
         ----------
         prompt: `str` or `List[str]`
-            The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            The prompt or prompts to guide video generation. If not defined, you need to pass `prompt_embeds`.
         n: `int`, defaults to 1
-            The number of images to generate per prompt. Must be between 1 and 10.
+            The number of videos to generate per prompt. Must be between 1 and 10.
         Returns
         -------
-        ImageList
-            A list of image objects.
+        VideoList
+            A list of video objects.
         """
diff --git a/xinference/model/video/tests/test_diffusers_video.py b/xinference/model/video/tests/test_diffusers_video.py
index 2982486f99..b5ca5dcb6f 100644
--- a/xinference/model/video/tests/test_diffusers_video.py
+++ b/xinference/model/video/tests/test_diffusers_video.py
@@ -11,13 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import base64
 import logging
-import os.path
-import shutil
-from io import BytesIO
-
-from PIL import Image
 
 from .. import BUILTIN_VIDEO_MODELS
 from ..core import cache

From db4c9828f620e975d677d61c9cb8f8976f000844 Mon Sep 17 00:00:00 2001
From: codingl2k1
Date: Thu, 8 Aug 2024 22:15:13 +0200
Subject: [PATCH 09/15] Fix typing

---
 xinference/model/video/diffusers.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/xinference/model/video/diffusers.py b/xinference/model/video/diffusers.py
index d14a811de6..85ab1cde8d 100644
--- a/xinference/model/video/diffusers.py
+++ b/xinference/model/video/diffusers.py
@@ -17,6 +17,7 @@
 import sys
 import time
 import uuid
+from typing import TYPE_CHECKING
 
 import torch
 
@@ -24,6 +25,10 @@
 from ...device_utils import move_model_to_available_device
 from ...types import VideoList
 
+if TYPE_CHECKING:
+    from .core import VideoModelFamilyV1
+
+
 logger = logging.getLogger(__name__)

From 2308d2345cb91cb31cf23b2bdb2e142dbc629c97 Mon Sep 17 00:00:00 2001
From: codingl2k1
Date: Thu, 8 Aug 2024 22:23:45 +0200
Subject: [PATCH 10/15] Fix lint

---
 xinference/model/video/core.py      | 1 +
 xinference/model/video/diffusers.py | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/xinference/model/video/core.py b/xinference/model/video/core.py
index f2b0a3cce2..3b9f96ad9a 100644
--- a/xinference/model/video/core.py
+++ b/xinference/model/video/core.py
@@ -164,6 +164,7 @@ def create_video_model_instance(
     model_spec = match_diffusion(model_name, download_hub)
     if not model_path:
         model_path = cache(model_spec)
+    assert model_path is not None
 
     model = DiffUsersVideoModel(
         model_uid,
diff --git a/xinference/model/video/diffusers.py b/xinference/model/video/diffusers.py
index 85ab1cde8d..930cefa09e 100644
--- a/xinference/model/video/diffusers.py
+++ b/xinference/model/video/diffusers.py
@@ -94,7 +94,7 @@ def text_to_video(
             "diffusers text_to_video args: %s",
             kwargs,
         )
-        # assert callable(self._model)
+        assert self._model is not None
         prompt_embeds, _ = self._model.encode_prompt(
             prompt=prompt,
             do_classifier_free_guidance=True,
@@ -103,6 +103,7 @@ def text_to_video(
             device=self._model.device,
             dtype=torch.float16,
         )
+        assert callable(self._model)
         output = self._model(
             num_inference_steps=50,
             guidance_scale=6,

From f5cbbd6ec7da997fef8a5253ff796ee11ede3a39 Mon Sep 17 00:00:00 2001
From: codingl2k1
Date: Thu, 8 Aug 2024 22:25:13 +0200
Subject: [PATCH 11/15] Fix

---
 xinference/model/video/tests/test_diffusers_video.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/xinference/model/video/tests/test_diffusers_video.py b/xinference/model/video/tests/test_diffusers_video.py
index b5ca5dcb6f..3676612c05 100644
--- a/xinference/model/video/tests/test_diffusers_video.py
+++ b/xinference/model/video/tests/test_diffusers_video.py
@@ -13,6 +13,8 @@
 # limitations under the License.
 import logging
 
+import pytest
+
 from .. import BUILTIN_VIDEO_MODELS
 from ..core import cache
 from ..diffusers import DiffUsersVideoModel
@@ -20,6 +22,7 @@
 logger = logging.getLogger(__name__)
 
 
+@pytest.mark.skip(reason="Video model requires too much GPU memory.")
 def test_model():
     test_model_spec = next(iter(BUILTIN_VIDEO_MODELS.values()))
     model_path = cache(test_model_spec)
@@ -31,6 +34,7 @@ def test_model():
     assert r
 
 
+@pytest.mark.skip(reason="Video model requires too much GPU memory.")
 def test_client(setup):
     endpoint, _ = setup
     from ....client import Client
@@ -56,4 +60,4 @@ def test_client(setup):
         "The background includes a small, flowing stream and vibrant green foliage, "
        "enhancing the peaceful and magical atmosphere of this unique musical performance." 
) - print(r) + assert r From 6b1a4dd3b620fa53d7ef9c8283785c6f2b72ab5d Mon Sep 17 00:00:00 2001 From: qinxuye Date: Fri, 9 Aug 2024 07:10:56 +0000 Subject: [PATCH 12/15] fix --- setup.cfg | 4 + xinference/client/restful/restful_client.py | 2 +- xinference/deploy/docker/requirements.txt | 1 + xinference/deploy/docker/requirements_cpu.txt | 1 + xinference/model/video/diffusers.py | 87 +++++++++++++++++-- xinference/types.py | 7 +- 6 files changed, 91 insertions(+), 11 deletions(-) diff --git a/setup.cfg b/setup.cfg index 05e33f9b5a..8664c59fdd 100644 --- a/setup.cfg +++ b/setup.cfg @@ -95,6 +95,7 @@ all = sentence-transformers>=2.7.0 vllm>=0.2.6 ; sys_platform=='linux' diffusers>=0.25.0 # fix conflict with matcha-tts + imageio-ffmpeg # For video controlnet_aux orjson auto-gptq ; sys_platform!='darwin' @@ -158,6 +159,9 @@ rerank = image = diffusers>=0.25.0 # fix conflict with matcha-tts controlnet_aux +video = + diffusers + imageio-ffmpeg audio = funasr omegaconf~=2.3.0 diff --git a/xinference/client/restful/restful_client.py b/xinference/client/restful/restful_client.py index 24ba0b7038..c11c30c29f 100644 --- a/xinference/client/restful/restful_client.py +++ b/xinference/client/restful/restful_client.py @@ -402,7 +402,7 @@ def text_to_video( response = requests.post(url, json=request_body, headers=self.auth_headers) if response.status_code != 200: raise RuntimeError( - f"Failed to create the images, detail: {_get_error_string(response)}" + f"Failed to create the video, detail: {_get_error_string(response)}" ) response_data = response.json() diff --git a/xinference/deploy/docker/requirements.txt b/xinference/deploy/docker/requirements.txt index 66f6d650af..1830a7de25 100644 --- a/xinference/deploy/docker/requirements.txt +++ b/xinference/deploy/docker/requirements.txt @@ -60,6 +60,7 @@ onnxruntime==1.16.0; sys_platform == 'darwin' or sys_platform == 'windows' # Fo openai-whisper # For CosyVoice boto3>=1.28.55,<1.28.65 # For tensorizer tensorizer~=2.9.0 +imageio-ffmpeg # For video # sglang outlines>=0.0.44 diff --git a/xinference/deploy/docker/requirements_cpu.txt b/xinference/deploy/docker/requirements_cpu.txt index a117e0c549..7ae0a2544d 100644 --- a/xinference/deploy/docker/requirements_cpu.txt +++ b/xinference/deploy/docker/requirements_cpu.txt @@ -55,3 +55,4 @@ matcha-tts # For CosyVoice onnxruntime-gpu==1.16.0; sys_platform == 'linux' # For CosyVoice onnxruntime==1.16.0; sys_platform == 'darwin' or sys_platform == 'windows' # For CosyVoice openai-whisper # For CosyVoice +imageio-ffmpeg # For video diff --git a/xinference/model/video/diffusers.py b/xinference/model/video/diffusers.py index 930cefa09e..0cd951aa62 100644 --- a/xinference/model/video/diffusers.py +++ b/xinference/model/video/diffusers.py @@ -12,18 +12,25 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import base64
 import logging
 import os
 import sys
+import tempfile
 import time
 import uuid
-from typing import TYPE_CHECKING
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
+from io import BytesIO
+from typing import TYPE_CHECKING, List, Optional, Union
 
+import numpy as np
+import PIL.Image
 import torch
 
 from ...constants import XINFERENCE_VIDEO_DIR
 from ...device_utils import move_model_to_available_device
-from ...types import VideoList
+from ...types import Video, VideoList
 
 if TYPE_CHECKING:
     from .core import VideoModelFamilyV1
@@ -32,6 +39,26 @@
 logger = logging.getLogger(__name__)
 
 
+def export_to_video_imageio(
+    video_frames: Union[List[np.ndarray], List["PIL.Image.Image"]],
+    output_video_path: Optional[str] = None,
+    fps: int = 8,
+) -> str:
+    """
+    Export the video frames to a video file using the imageio library, to avoid the "green screen" issue (for example with CogVideoX).
+    """
+    import imageio
+
+    if output_video_path is None:
+        output_video_path = tempfile.NamedTemporaryFile(suffix=".mp4").name
+    if isinstance(video_frames[0], PIL.Image.Image):
+        video_frames = [np.array(frame) for frame in video_frames]
+    with imageio.get_writer(output_video_path, fps=fps) as writer:
+        for frame in video_frames:
+            writer.append_data(frame)
+    return output_video_path
+
+
 class DiffUsersVideoModel:
     def __init__(
         self,
@@ -66,7 +93,7 @@ def load(self):
             from diffusers import CogVideoXPipeline
 
             self._model = CogVideoXPipeline.from_pretrained(
-                self._model_path, torch_dtype=torch.float16
+                self._model_path, **self._kwargs
             )
         else:
             raise Exception(
@@ -86,32 +113,74 @@ def text_to_video(
         self,
         prompt: str,
         n: int = 1,
+        num_inference_steps: int = 50,
+        guidance_scale: int = 6,
+        response_format: str = "b64_json",
         **kwargs,
     ) -> VideoList:
-        from diffusers.utils import export_to_video
+        import gc
+
+        # a cv2 bug can cause the exported video to display incorrectly,
+        # so we use the imageio exporter instead
+        # from diffusers.utils import export_to_video
+        from ...device_utils import empty_cache
 
         logger.debug(
             "diffusers text_to_video args: %s",
             kwargs,
         )
         assert self._model is not None
+        if self._kwargs.get("cpu_offload"):
+            # if cpu offload is enabled,
+            # model.device would be CPU
+            device = "cuda"
+        else:
+            device = self._model.device
         prompt_embeds, _ = self._model.encode_prompt(
             prompt=prompt,
             do_classifier_free_guidance=True,
             num_videos_per_prompt=n,
             max_sequence_length=226,
-            device=self._model.device,
+            device=device,
             dtype=torch.float16,
         )
         assert callable(self._model)
         output = self._model(
-            num_inference_steps=50,
-            guidance_scale=6,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
             prompt_embeds=prompt_embeds,
+            **kwargs,
         )
+
+        # clean cache
+        gc.collect()
+        empty_cache()
+
+        os.makedirs(XINFERENCE_VIDEO_DIR, exist_ok=True)
         urls = []
         for f in output.frames:
             path = os.path.join(XINFERENCE_VIDEO_DIR, uuid.uuid4().hex + ".mp4")
-            p = export_to_video(f, path, fps=8)
+            p = export_to_video_imageio(f, path, fps=8)
             urls.append(p)
-        return VideoList(created=int(time.time()), data=urls)
+        if response_format == "url":
+            return VideoList(
+                created=int(time.time()),
+                data=[Video(url=url, b64_json=None) for url in urls],
+            )
+        elif response_format == "b64_json":
+
+            def _gen_base64_video(_video_url):
+                try:
+                    with open(_video_url, "rb") as f:
+                        buffered = BytesIO()
+                        buffered.write(f.read())
+                        return base64.b64encode(buffered.getvalue()).decode()
+                finally:
+                    os.remove(_video_url)
+
+            with ThreadPoolExecutor() as executor:
+                results = list(map(partial(executor.submit, 
_gen_base64_video), urls)) # type: ignore + video_list = [Video(url=None, b64_json=s.result()) for s in results] + return VideoList(created=int(time.time()), data=video_list) + else: + raise ValueError(f"Unsupported response format: {response_format}") diff --git a/xinference/types.py b/xinference/types.py index 4e1106ae51..3f636d94c3 100644 --- a/xinference/types.py +++ b/xinference/types.py @@ -52,9 +52,14 @@ class ImageList(TypedDict): data: List[Image] +class Video(TypedDict): + url: Optional[str] + b64_json: Optional[str] + + class VideoList(TypedDict): created: int - data: List[str] + data: List[Video] class EmbeddingUsage(TypedDict): From 2878e85d739d10d3f7ff318f6e2206817567b816 Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Fri, 9 Aug 2024 10:05:16 +0200 Subject: [PATCH 13/15] Fix --- xinference/model/video/diffusers.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/xinference/model/video/diffusers.py b/xinference/model/video/diffusers.py index 0cd951aa62..920572b61f 100644 --- a/xinference/model/video/diffusers.py +++ b/xinference/model/video/diffusers.py @@ -172,9 +172,7 @@ def text_to_video( def _gen_base64_video(_video_url): try: with open(_video_url, "rb") as f: - buffered = BytesIO() - buffered.write(f.read()) - return base64.b64encode(buffered.getvalue()).decode() + return base64.b64encode(f.read()).decode() finally: os.remove(_video_url) From 8bb906034277e150c1ee75de8f256caa9428fed7 Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Fri, 9 Aug 2024 10:10:09 +0200 Subject: [PATCH 14/15] Fix --- xinference/model/video/diffusers.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/xinference/model/video/diffusers.py b/xinference/model/video/diffusers.py index 920572b61f..8786d36c1a 100644 --- a/xinference/model/video/diffusers.py +++ b/xinference/model/video/diffusers.py @@ -41,7 +41,7 @@ def export_to_video_imageio( video_frames: Union[List[np.ndarray], List["PIL.Image.Image"]], - output_video_path: Optional[str] = None, + output_video_path: str, fps: int = 8, ) -> str: """ @@ -49,8 +49,6 @@ def export_to_video_imageio( """ import imageio - if output_video_path is None: - output_video_path = tempfile.NamedTemporaryFile(suffix=".mp4").name if isinstance(video_frames[0], PIL.Image.Image): video_frames = [np.array(frame) for frame in video_frames] with imageio.get_writer(output_video_path, fps=fps) as writer: From cd90792bd8c500c4285f6b401490a980a2db611c Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Fri, 9 Aug 2024 10:11:47 +0200 Subject: [PATCH 15/15] Fix lint --- xinference/model/video/diffusers.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/xinference/model/video/diffusers.py b/xinference/model/video/diffusers.py index 8786d36c1a..b9b8569918 100644 --- a/xinference/model/video/diffusers.py +++ b/xinference/model/video/diffusers.py @@ -16,13 +16,11 @@ import logging import os import sys -import tempfile import time import uuid from concurrent.futures import ThreadPoolExecutor from functools import partial -from io import BytesIO -from typing import TYPE_CHECKING, List, Optional, Union +from typing import TYPE_CHECKING, List, Union import numpy as np import PIL.Image
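
A minimal sketch of end-to-end usage of the video API added by this series, mirroring xinference/model/video/tests/test_diffusers_video.py. It assumes a Xinference server running locally on the default port, an illustrative prompt and model uid, and enough GPU memory for CogVideoX-2b:

    import base64

    from xinference.client import Client

    client = Client("http://127.0.0.1:9997")  # assumed local endpoint
    model_uid = client.launch_model(
        model_uid="my_video_model",
        model_name="CogVideoX-2b",
        model_type="video",
    )
    model = client.get_model(model_uid)
    # text_to_video defaults to response_format="b64_json" (see PATCH 12),
    # so each returned Video carries the clip base64-encoded.
    result = model.text_to_video(prompt="an apple rolling on a wooden table", n=1)
    with open("output.mp4", "wb") as f:
        f.write(base64.b64decode(result["data"][0]["b64_json"]))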