diff --git a/setup.cfg b/setup.cfg index 05e33f9b5a..8664c59fdd 100644 --- a/setup.cfg +++ b/setup.cfg @@ -95,6 +95,7 @@ all = sentence-transformers>=2.7.0 vllm>=0.2.6 ; sys_platform=='linux' diffusers>=0.25.0 # fix conflict with matcha-tts + imageio-ffmpeg # For video controlnet_aux orjson auto-gptq ; sys_platform!='darwin' @@ -158,6 +159,9 @@ rerank = image = diffusers>=0.25.0 # fix conflict with matcha-tts controlnet_aux +video = + diffusers + imageio-ffmpeg audio = funasr omegaconf~=2.3.0 diff --git a/xinference/api/restful_api.py b/xinference/api/restful_api.py index 87e19d98cf..47b4848c80 100644 --- a/xinference/api/restful_api.py +++ b/xinference/api/restful_api.py @@ -65,6 +65,7 @@ CreateCompletion, ImageList, PeftModelConfig, + VideoList, max_tokens_field, ) from .oauth2.auth_service import AuthService @@ -123,6 +124,14 @@ class TextToImageRequest(BaseModel): user: Optional[str] = None +class TextToVideoRequest(BaseModel): + model: str + prompt: Union[str, List[str]] = Field(description="The input to embed.") + n: Optional[int] = 1 + kwargs: Optional[str] = None + user: Optional[str] = None + + class SpeechRequest(BaseModel): model: str input: str @@ -512,6 +521,17 @@ async def internal_exception_handler(request: Request, exc: Exception): else None ), ) + self._router.add_api_route( + "/v1/video/generations", + self.create_videos, + methods=["POST"], + response_model=VideoList, + dependencies=( + [Security(self._auth_service, scopes=["models:read"])] + if self.is_authenticated() + else None + ), + ) self._router.add_api_route( "/v1/chat/completions", self.create_chat_completion, @@ -1546,6 +1566,38 @@ async def create_flexible_infer(self, request: Request) -> Response: await self._report_error_event(model_uid, str(e)) raise HTTPException(status_code=500, detail=str(e)) + async def create_videos(self, request: Request) -> Response: + body = TextToVideoRequest.parse_obj(await request.json()) + model_uid = body.model + try: + model = await (await self._get_supervisor_ref()).get_model(model_uid) + except ValueError as ve: + logger.error(str(ve), exc_info=True) + await self._report_error_event(model_uid, str(ve)) + raise HTTPException(status_code=400, detail=str(ve)) + except Exception as e: + logger.error(e, exc_info=True) + await self._report_error_event(model_uid, str(e)) + raise HTTPException(status_code=500, detail=str(e)) + + try: + kwargs = json.loads(body.kwargs) if body.kwargs else {} + video_list = await model.text_to_video( + prompt=body.prompt, + n=body.n, + **kwargs, + ) + return Response(content=video_list, media_type="application/json") + except RuntimeError as re: + logger.error(re, exc_info=True) + await self._report_error_event(model_uid, str(re)) + self.handle_request_limit_error(re) + raise HTTPException(status_code=400, detail=str(re)) + except Exception as e: + logger.error(e, exc_info=True) + await self._report_error_event(model_uid, str(e)) + raise HTTPException(status_code=500, detail=str(e)) + async def create_chat_completion(self, request: Request) -> Response: raw_body = await request.json() body = CreateChatCompletion.parse_obj(raw_body) diff --git a/xinference/client/restful/restful_client.py b/xinference/client/restful/restful_client.py index aa0955f75d..c11c30c29f 100644 --- a/xinference/client/restful/restful_client.py +++ b/xinference/client/restful/restful_client.py @@ -31,6 +31,7 @@ ImageList, LlamaCppGenerateConfig, PytorchGenerateConfig, + VideoList, ) @@ -370,6 +371,44 @@ def inpainting( return response_data +class 
RESTfulVideoModelHandle(RESTfulModelHandle): + def text_to_video( + self, + prompt: str, + n: int = 1, + **kwargs, + ) -> "VideoList": + """ + Creates a video by the input text. + + Parameters + ---------- + prompt: `str` or `List[str]` + The prompt or prompts to guide video generation. If not defined, you need to pass `prompt_embeds`. + n: `int`, defaults to 1 + The number of videos to generate per prompt. Must be between 1 and 10. + Returns + ------- + VideoList + A list of video objects. + """ + url = f"{self._base_url}/v1/video/generations" + request_body = { + "model": self._model_uid, + "prompt": prompt, + "n": n, + "kwargs": json.dumps(kwargs), + } + response = requests.post(url, json=request_body, headers=self.auth_headers) + if response.status_code != 200: + raise RuntimeError( + f"Failed to create the video, detail: {_get_error_string(response)}" + ) + + response_data = response.json() + return response_data + + class RESTfulGenerateModelHandle(RESTfulModelHandle): def generate( self, @@ -1015,6 +1054,10 @@ def get_model(self, model_uid: str) -> RESTfulModelHandle: return RESTfulAudioModelHandle( model_uid, self.base_url, auth_headers=self._headers ) + elif desc["model_type"] == "video": + return RESTfulVideoModelHandle( + model_uid, self.base_url, auth_headers=self._headers + ) elif desc["model_type"] == "flexible": return RESTfulFlexibleModelHandle( model_uid, self.base_url, auth_headers=self._headers diff --git a/xinference/constants.py b/xinference/constants.py index 3efad56ed3..c9ba4e5ddc 100644 --- a/xinference/constants.py +++ b/xinference/constants.py @@ -47,6 +47,7 @@ def get_xinference_home() -> str: XINFERENCE_MODEL_DIR = os.path.join(XINFERENCE_HOME, "model") XINFERENCE_LOG_DIR = os.path.join(XINFERENCE_HOME, "logs") XINFERENCE_IMAGE_DIR = os.path.join(XINFERENCE_HOME, "image") +XINFERENCE_VIDEO_DIR = os.path.join(XINFERENCE_HOME, "video") XINFERENCE_AUTH_DIR = os.path.join(XINFERENCE_HOME, "auth") XINFERENCE_CSG_ENDPOINT = str( os.environ.get(XINFERENCE_ENV_CSG_ENDPOINT, "https://hub-stg.opencsg.com/") diff --git a/xinference/core/model.py b/xinference/core/model.py index 7fc41b9c53..24cfe3c6e8 100644 --- a/xinference/core/model.py +++ b/xinference/core/model.py @@ -774,6 +774,27 @@ async def infer( f"Model {self._model.model_spec} is not for flexible infer." ) + @log_async(logger=logger) + @request_limit + async def text_to_video( + self, + prompt: str, + n: int = 1, + *args, + **kwargs, + ): + if hasattr(self._model, "text_to_video"): + return await self._call_wrapper_json( + self._model.text_to_video, + prompt, + n, + *args, + **kwargs, + ) + raise AttributeError( + f"Model {self._model.model_spec} is not for creating video." 
+ ) + async def record_metrics(self, name, op, kwargs): worker_ref = await self._get_worker_ref() await worker_ref.record_metrics(name, op, kwargs) diff --git a/xinference/core/supervisor.py b/xinference/core/supervisor.py index 54e4b65849..2b6f7b9fc5 100644 --- a/xinference/core/supervisor.py +++ b/xinference/core/supervisor.py @@ -64,6 +64,7 @@ from ..model.image import ImageModelFamilyV1 from ..model.llm import LLMFamilyV1 from ..model.rerank import RerankModelSpec + from ..model.video import VideoModelFamilyV1 from .worker import WorkerActor @@ -484,6 +485,31 @@ async def _to_audio_model_reg( res["model_instance_count"] = instance_cnt return res + async def _to_video_model_reg( + self, model_family: "VideoModelFamilyV1", is_builtin: bool + ) -> Dict[str, Any]: + from ..model.video import get_cache_status + + instance_cnt = await self.get_instance_count(model_family.model_name) + version_cnt = await self.get_model_version_count(model_family.model_name) + + if self.is_local_deployment(): + # TODO: does not work when the supervisor and worker are running on separate nodes. + cache_status = get_cache_status(model_family) + res = { + **model_family.dict(), + "cache_status": cache_status, + "is_builtin": is_builtin, + } + else: + res = { + **model_family.dict(), + "is_builtin": is_builtin, + } + res["model_version_count"] = version_cnt + res["model_instance_count"] = instance_cnt + return res + async def _to_flexible_model_reg( self, model_spec: "FlexibleModelSpec", is_builtin: bool ) -> Dict[str, Any]: @@ -602,6 +628,17 @@ def sort_helper(item): {"model_name": model_spec.model_name, "is_builtin": False} ) + ret.sort(key=sort_helper) + return ret + elif model_type == "video": + from ..model.video import BUILTIN_VIDEO_MODELS + + for model_name, family in BUILTIN_VIDEO_MODELS.items(): + if detailed: + ret.append(await self._to_video_model_reg(family, is_builtin=True)) + else: + ret.append({"model_name": model_name, "is_builtin": True}) + ret.sort(key=sort_helper) return ret elif model_type == "rerank": diff --git a/xinference/core/worker.py b/xinference/core/worker.py index 9524bd604a..cfffd7fb17 100644 --- a/xinference/core/worker.py +++ b/xinference/core/worker.py @@ -735,6 +735,8 @@ async def _get_model_ability(self, model: Any, model_type: str) -> List[str]: return ["text_to_image"] elif model_type == "audio": return ["audio_to_text"] + elif model_type == "video": + return ["text_to_video"] elif model_type == "flexible": return ["flexible"] else: diff --git a/xinference/deploy/docker/requirements.txt b/xinference/deploy/docker/requirements.txt index 66f6d650af..1830a7de25 100644 --- a/xinference/deploy/docker/requirements.txt +++ b/xinference/deploy/docker/requirements.txt @@ -60,6 +60,7 @@ onnxruntime==1.16.0; sys_platform == 'darwin' or sys_platform == 'windows' # Fo openai-whisper # For CosyVoice boto3>=1.28.55,<1.28.65 # For tensorizer tensorizer~=2.9.0 +imageio-ffmpeg # For video # sglang outlines>=0.0.44 diff --git a/xinference/deploy/docker/requirements_cpu.txt b/xinference/deploy/docker/requirements_cpu.txt index a117e0c549..7ae0a2544d 100644 --- a/xinference/deploy/docker/requirements_cpu.txt +++ b/xinference/deploy/docker/requirements_cpu.txt @@ -55,3 +55,4 @@ matcha-tts # For CosyVoice onnxruntime-gpu==1.16.0; sys_platform == 'linux' # For CosyVoice onnxruntime==1.16.0; sys_platform == 'darwin' or sys_platform == 'windows' # For CosyVoice openai-whisper # For CosyVoice +imageio-ffmpeg # For video diff --git a/xinference/model/core.py b/xinference/model/core.py index 
09cb4104a4..4591d255b0 100644 --- a/xinference/model/core.py +++ b/xinference/model/core.py @@ -65,6 +65,7 @@ def create_model_instance( from .image.core import create_image_model_instance from .llm.core import create_llm_model_instance from .rerank.core import create_rerank_model_instance + from .video.core import create_video_model_instance if model_type == "LLM": return create_llm_model_instance( @@ -127,6 +128,17 @@ def create_model_instance( model_path, **kwargs, ) + elif model_type == "video": + kwargs.pop("trust_remote_code", None) + return create_video_model_instance( + subpool_addr, + devices, + model_uid, + model_name, + download_hub, + model_path, + **kwargs, + ) elif model_type == "flexible": kwargs.pop("trust_remote_code", None) return create_flexible_model_instance( diff --git a/xinference/model/video/__init__.py b/xinference/model/video/__init__.py new file mode 100644 index 0000000000..e1325b0bbb --- /dev/null +++ b/xinference/model/video/__init__.py @@ -0,0 +1,62 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import codecs +import json +import os +from itertools import chain + +from .core import ( + BUILTIN_VIDEO_MODELS, + MODEL_NAME_TO_REVISION, + MODELSCOPE_VIDEO_MODELS, + VIDEO_MODEL_DESCRIPTIONS, + VideoModelFamilyV1, + generate_video_description, + get_cache_status, + get_video_model_descriptions, +) + +_model_spec_json = os.path.join(os.path.dirname(__file__), "model_spec.json") +_model_spec_modelscope_json = os.path.join( + os.path.dirname(__file__), "model_spec_modelscope.json" +) +BUILTIN_VIDEO_MODELS.update( + dict( + (spec["model_name"], VideoModelFamilyV1(**spec)) + for spec in json.load(codecs.open(_model_spec_json, "r", encoding="utf-8")) + ) +) +for model_name, model_spec in BUILTIN_VIDEO_MODELS.items(): + MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision) + +MODELSCOPE_VIDEO_MODELS.update( + dict( + (spec["model_name"], VideoModelFamilyV1(**spec)) + for spec in json.load( + codecs.open(_model_spec_modelscope_json, "r", encoding="utf-8") + ) + ) +) +for model_name, model_spec in MODELSCOPE_VIDEO_MODELS.items(): + MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision) + +# register model description +for model_name, model_spec in chain( + MODELSCOPE_VIDEO_MODELS.items(), BUILTIN_VIDEO_MODELS.items() +): + VIDEO_MODEL_DESCRIPTIONS.update(generate_video_description(model_spec)) + +del _model_spec_json +del _model_spec_modelscope_json diff --git a/xinference/model/video/core.py b/xinference/model/video/core.py new file mode 100644 index 0000000000..3b9f96ad9a --- /dev/null +++ b/xinference/model/video/core.py @@ -0,0 +1,178 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging +import os +from collections import defaultdict +from typing import Dict, List, Literal, Optional, Tuple + +from ...constants import XINFERENCE_CACHE_DIR +from ..core import CacheableModelSpec, ModelDescription +from ..utils import valid_model_revision +from .diffusers import DiffUsersVideoModel + +MAX_ATTEMPTS = 3 + +logger = logging.getLogger(__name__) + +MODEL_NAME_TO_REVISION: Dict[str, List[str]] = defaultdict(list) +VIDEO_MODEL_DESCRIPTIONS: Dict[str, List[Dict]] = defaultdict(list) +BUILTIN_VIDEO_MODELS: Dict[str, "VideoModelFamilyV1"] = {} +MODELSCOPE_VIDEO_MODELS: Dict[str, "VideoModelFamilyV1"] = {} + + +def get_video_model_descriptions(): + import copy + + return copy.deepcopy(VIDEO_MODEL_DESCRIPTIONS) + + +class VideoModelFamilyV1(CacheableModelSpec): + model_family: str + model_name: str + model_id: str + model_revision: str + model_hub: str = "huggingface" + model_ability: Optional[List[str]] + + +class VideoModelDescription(ModelDescription): + def __init__( + self, + address: Optional[str], + devices: Optional[List[str]], + model_spec: VideoModelFamilyV1, + model_path: Optional[str] = None, + ): + super().__init__(address, devices, model_path=model_path) + self._model_spec = model_spec + + def to_dict(self): + return { + "model_type": "video", + "address": self.address, + "accelerators": self.devices, + "model_name": self._model_spec.model_name, + "model_family": self._model_spec.model_family, + "model_revision": self._model_spec.model_revision, + "model_ability": self._model_spec.model_ability, + } + + def to_version_info(self): + if self._model_path is None: + is_cached = get_cache_status(self._model_spec) + file_location = get_cache_dir(self._model_spec) + else: + is_cached = True + file_location = self._model_path + + return [ + { + "model_version": self._model_spec.model_name, + "model_file_location": file_location, + "cache_status": is_cached, + } + ] + + +def generate_video_description( + video_model: VideoModelFamilyV1, +) -> Dict[str, List[Dict]]: + res = defaultdict(list) + res[video_model.model_name].extend( + VideoModelDescription(None, None, video_model).to_version_info() + ) + return res + + +def match_diffusion( + model_name: str, + download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None, +) -> VideoModelFamilyV1: + from ..utils import download_from_modelscope + from . 
import BUILTIN_VIDEO_MODELS, MODELSCOPE_VIDEO_MODELS + + if download_hub == "modelscope" and model_name in MODELSCOPE_VIDEO_MODELS: + logger.debug(f"Video model {model_name} found in ModelScope.") + return MODELSCOPE_VIDEO_MODELS[model_name] + elif download_hub == "huggingface" and model_name in BUILTIN_VIDEO_MODELS: + logger.debug(f"Video model {model_name} found in Huggingface.") + return BUILTIN_VIDEO_MODELS[model_name] + elif download_from_modelscope() and model_name in MODELSCOPE_VIDEO_MODELS: + logger.debug(f"Video model {model_name} found in ModelScope.") + return MODELSCOPE_VIDEO_MODELS[model_name] + elif model_name in BUILTIN_VIDEO_MODELS: + logger.debug(f"Video model {model_name} found in Huggingface.") + return BUILTIN_VIDEO_MODELS[model_name] + else: + raise ValueError( + f"Video model {model_name} not found, available" + f"model list: {BUILTIN_VIDEO_MODELS.keys()}" + ) + + +def cache(model_spec: VideoModelFamilyV1): + from ..utils import cache + + return cache(model_spec, VideoModelDescription) + + +def get_cache_dir(model_spec: VideoModelFamilyV1): + return os.path.realpath(os.path.join(XINFERENCE_CACHE_DIR, model_spec.model_name)) + + +def get_cache_status( + model_spec: VideoModelFamilyV1, +) -> bool: + cache_dir = get_cache_dir(model_spec) + meta_path = os.path.join(cache_dir, "__valid_download") + + model_name = model_spec.model_name + if model_name in BUILTIN_VIDEO_MODELS and model_name in MODELSCOPE_VIDEO_MODELS: + hf_spec = BUILTIN_VIDEO_MODELS[model_name] + ms_spec = MODELSCOPE_VIDEO_MODELS[model_name] + + return any( + [ + valid_model_revision(meta_path, hf_spec.model_revision), + valid_model_revision(meta_path, ms_spec.model_revision), + ] + ) + else: # Usually for UT + return valid_model_revision(meta_path, model_spec.model_revision) + + +def create_video_model_instance( + subpool_addr: str, + devices: List[str], + model_uid: str, + model_name: str, + download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None, + model_path: Optional[str] = None, + **kwargs, +) -> Tuple[DiffUsersVideoModel, VideoModelDescription]: + model_spec = match_diffusion(model_name, download_hub) + if not model_path: + model_path = cache(model_spec) + assert model_path is not None + + model = DiffUsersVideoModel( + model_uid, + model_path, + model_spec, + **kwargs, + ) + model_description = VideoModelDescription( + subpool_addr, devices, model_spec, model_path=model_path + ) + return model, model_description diff --git a/xinference/model/video/diffusers.py b/xinference/model/video/diffusers.py new file mode 100644 index 0000000000..b9b8569918 --- /dev/null +++ b/xinference/model/video/diffusers.py @@ -0,0 +1,180 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import base64 +import logging +import os +import sys +import time +import uuid +from concurrent.futures import ThreadPoolExecutor +from functools import partial +from typing import TYPE_CHECKING, List, Union + +import numpy as np +import PIL.Image +import torch + +from ...constants import XINFERENCE_VIDEO_DIR +from ...device_utils import move_model_to_available_device +from ...types import Video, VideoList + +if TYPE_CHECKING: + from .core import VideoModelFamilyV1 + + +logger = logging.getLogger(__name__) + + +def export_to_video_imageio( + video_frames: Union[List[np.ndarray], List["PIL.Image.Image"]], + output_video_path: str, + fps: int = 8, +) -> str: + """ + Export the video frames to a video file using imageio lib to Avoid "green screen" issue (for example CogVideoX) + """ + import imageio + + if isinstance(video_frames[0], PIL.Image.Image): + video_frames = [np.array(frame) for frame in video_frames] + with imageio.get_writer(output_video_path, fps=fps) as writer: + for frame in video_frames: + writer.append_data(frame) + return output_video_path + + +class DiffUsersVideoModel: + def __init__( + self, + model_uid: str, + model_path: str, + model_spec: "VideoModelFamilyV1", + **kwargs, + ): + self._model_uid = model_uid + self._model_path = model_path + self._model_spec = model_spec + self._model = None + self._kwargs = kwargs + + @property + def model_spec(self): + return self._model_spec + + def load(self): + import torch + + torch_dtype = self._kwargs.get("torch_dtype") + if sys.platform != "darwin" and torch_dtype is None: + # The following params crashes on Mac M2 + self._kwargs["torch_dtype"] = torch.float16 + self._kwargs["variant"] = "fp16" + self._kwargs["use_safetensors"] = True + if isinstance(torch_dtype, str): + self._kwargs["torch_dtype"] = getattr(torch, torch_dtype) + + if self._model_spec.model_family == "CogVideoX": + from diffusers import CogVideoXPipeline + + self._model = CogVideoXPipeline.from_pretrained( + self._model_path, **self._kwargs + ) + else: + raise Exception( + f"Unsupported model family: {self._model_spec.model_family}" + ) + + if self._kwargs.get("cpu_offload", False): + logger.debug("CPU offloading model") + self._model.enable_model_cpu_offload() + elif not self._kwargs.get("device_map"): + logger.debug("Loading model to available device") + self._model = move_model_to_available_device(self._model) + # Recommended if your computer has < 64 GB of RAM + self._model.enable_attention_slicing() + + def text_to_video( + self, + prompt: str, + n: int = 1, + num_inference_steps: int = 50, + guidance_scale: int = 6, + response_format: str = "b64_json", + **kwargs, + ) -> VideoList: + import gc + + # cv2 bug will cause the video cannot be normally displayed + # thus we use the imageio one + # from diffusers.utils import export_to_video + from ...device_utils import empty_cache + + logger.debug( + "diffusers text_to_video args: %s", + kwargs, + ) + assert self._model is not None + if self._kwargs.get("cpu_offload"): + # if enabled cpu offload, + # the model.device would be CPU + device = "cuda" + else: + device = self._model.device + prompt_embeds, _ = self._model.encode_prompt( + prompt=prompt, + do_classifier_free_guidance=True, + num_videos_per_prompt=n, + max_sequence_length=226, + device=device, + dtype=torch.float16, + ) + assert callable(self._model) + output = self._model( + num_inference_steps=num_inference_steps, + guidance_scale=guidance_scale, + prompt_embeds=prompt_embeds, + **kwargs, + ) + + # clean cache + gc.collect() + empty_cache() + + 
os.makedirs(XINFERENCE_VIDEO_DIR, exist_ok=True) + urls = [] + for f in output.frames: + path = os.path.join(XINFERENCE_VIDEO_DIR, uuid.uuid4().hex + ".mp4") + p = export_to_video_imageio(f, path, fps=8) + urls.append(p) + if response_format == "url": + return VideoList( + created=int(time.time()), + data=[Video(url=url, b64_json=None) for url in urls], + ) + elif response_format == "b64_json": + + def _gen_base64_video(_video_url): + try: + with open(_video_url, "rb") as f: + return base64.b64encode(f.read()).decode() + finally: + os.remove(_video_url) + + with ThreadPoolExecutor() as executor: + results = list(map(partial(executor.submit, _gen_base64_video), urls)) # type: ignore + video_list = [Video(url=None, b64_json=s.result()) for s in results] + return VideoList(created=int(time.time()), data=video_list) + else: + raise ValueError(f"Unsupported response format: {response_format}") diff --git a/xinference/model/video/model_spec.json b/xinference/model/video/model_spec.json new file mode 100644 index 0000000000..52b748fd6a --- /dev/null +++ b/xinference/model/video/model_spec.json @@ -0,0 +1,11 @@ +[ + { + "model_name": "CogVideoX-2b", + "model_family": "CogVideoX", + "model_id": "THUDM/CogVideoX-2b", + "model_revision": "4bbfb1de622b80bc1b77b6e9aced75f816be0e38", + "model_ability": [ + "text2video" + ] + } +] diff --git a/xinference/model/video/model_spec_modelscope.json b/xinference/model/video/model_spec_modelscope.json new file mode 100644 index 0000000000..e3cb604921 --- /dev/null +++ b/xinference/model/video/model_spec_modelscope.json @@ -0,0 +1,12 @@ +[ + { + "model_name": "CogVideoX-2b", + "model_family": "CogVideoX", + "model_hub": "modelscope", + "model_id": "ZhipuAI/CogVideoX-2b", + "model_revision": "master", + "model_ability": [ + "text2video" + ] + } +] diff --git a/xinference/model/video/tests/__init__.py b/xinference/model/video/tests/__init__.py new file mode 100644 index 0000000000..37f6558d95 --- /dev/null +++ b/xinference/model/video/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/xinference/model/video/tests/test_diffusers_video.py b/xinference/model/video/tests/test_diffusers_video.py new file mode 100644 index 0000000000..3676612c05 --- /dev/null +++ b/xinference/model/video/tests/test_diffusers_video.py @@ -0,0 +1,63 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging + +import pytest + +from .. 
import BUILTIN_VIDEO_MODELS +from ..core import cache +from ..diffusers import DiffUsersVideoModel + +logger = logging.getLogger(__name__) + + +@pytest.mark.skip(reason="Video model requires too many GRAM.") +def test_model(): + test_model_spec = next(iter(BUILTIN_VIDEO_MODELS.values())) + model_path = cache(test_model_spec) + model = DiffUsersVideoModel("mock", model_path, test_model_spec) + # input is a string + input_text = "an apple" + model.load() + r = model.text_to_image(input_text) + assert r + + +@pytest.mark.skip(reason="Video model requires too many GRAM.") +def test_client(setup): + endpoint, _ = setup + from ....client import Client + + client = Client(endpoint) + + model_uid = client.launch_model( + model_uid="my_video_model", + model_name="CogVideoX-2b", + model_type="video", + ) + model = client.get_model(model_uid) + assert model + + r = model.text_to_video( + prompt="A panda, dressed in a small, red jacket and a tiny hat, " + "sits on a wooden stool in a serene bamboo forest. " + "The panda's fluffy paws strum a miniature acoustic guitar, " + "producing soft, melodic tunes. Nearby, a few other pandas gather, " + "watching curiously and some clapping in rhythm. " + "Sunlight filters through the tall bamboo, casting a gentle glow on the scene. " + "The panda's face is expressive, showing concentration and joy as it plays. " + "The background includes a small, flowing stream and vibrant green foliage, " + "enhancing the peaceful and magical atmosphere of this unique musical performance." + ) + assert r diff --git a/xinference/types.py b/xinference/types.py index e66e90bee1..3f636d94c3 100644 --- a/xinference/types.py +++ b/xinference/types.py @@ -52,6 +52,16 @@ class ImageList(TypedDict): data: List[Image] +class Video(TypedDict): + url: Optional[str] + b64_json: Optional[str] + + +class VideoList(TypedDict): + created: int + data: List[Video] + + class EmbeddingUsage(TypedDict): prompt_tokens: int total_tokens: int diff --git a/xinference/web/ui/src/scenes/launch_model/index.js b/xinference/web/ui/src/scenes/launch_model/index.js index 1339e94d4f..55f05747bd 100644 --- a/xinference/web/ui/src/scenes/launch_model/index.js +++ b/xinference/web/ui/src/scenes/launch_model/index.js @@ -69,6 +69,7 @@ const LaunchModel = () => { + @@ -93,6 +94,9 @@ const LaunchModel = () => { + + + diff --git a/xinference/web/ui/src/scenes/running_models/index.js b/xinference/web/ui/src/scenes/running_models/index.js index e91858f2fd..9f9486651a 100644 --- a/xinference/web/ui/src/scenes/running_models/index.js +++ b/xinference/web/ui/src/scenes/running_models/index.js @@ -21,6 +21,7 @@ const RunningModels = () => { const [embeddingModelData, setEmbeddingModelData] = useState([]) const [imageModelData, setImageModelData] = useState([]) const [audioModelData, setAudioModelData] = useState([]) + const [videoModelData, setVideoModelData] = useState([]) const [rerankModelData, setRerankModelData] = useState([]) const [flexibleModelData, setFlexibleModelData] = useState([]) const { isCallingApi, setIsCallingApi } = useContext(ApiContext) @@ -53,6 +54,9 @@ const RunningModels = () => { setAudioModelData([ { id: 'Loading, do not refresh page...', url: 'IS_LOADING' }, ]) + setVideoModelData([ + { id: 'Loading, do not refresh page...', url: 'IS_LOADING' }, + ]) setImageModelData([ { id: 'Loading, do not refresh page...', url: 'IS_LOADING' }, ]) @@ -72,6 +76,7 @@ const RunningModels = () => { const newEmbeddingModelData = [] const newImageModelData = [] const newAudioModelData = [] + const newVideoModelData 
= [] const newRerankModelData = [] const newFlexibleModelData = [] response.data.forEach((model) => { @@ -86,6 +91,8 @@ const RunningModels = () => { newEmbeddingModelData.push(newValue) } else if (newValue.model_type === 'audio') { newAudioModelData.push(newValue) + } else if (newValue.model_type === 'video') { + newVideoModelData.push(newValue) } else if (newValue.model_type === 'image') { newImageModelData.push(newValue) } else if (newValue.model_type === 'rerank') { @@ -97,6 +104,7 @@ const RunningModels = () => { setLlmData(newLlmData) setEmbeddingModelData(newEmbeddingModelData) setAudioModelData(newAudioModelData) + setVideoModelData(newVideoModelData) setImageModelData(newImageModelData) setRerankModelData(newRerankModelData) setFlexibleModelData(newFlexibleModelData) @@ -591,6 +599,7 @@ const RunningModels = () => { }, ] const audioModelColumns = embeddingModelColumns + const videoModelColumns = embeddingModelColumns const rerankModelColumns = embeddingModelColumns const flexibleModelColumns = embeddingModelColumns @@ -652,6 +661,7 @@ const RunningModels = () => { + @@ -725,6 +735,20 @@ const RunningModels = () => { /> + + + + +
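
For reviewers who want to exercise the new endpoint end to end, here is a minimal client-side sketch modeled on the `test_client` case above. It is an illustration, not part of the patch: it assumes a running Xinference server at the default local endpoint, enough GPU memory to host CogVideoX-2b, and that extra keyword arguments such as `num_inference_steps` reach the diffusers pipeline via the JSON-encoded `kwargs` field that `RESTfulVideoModelHandle.text_to_video` sends.

```python
# Minimal sketch, assuming a local server at the default 127.0.0.1:9997 endpoint
# with enough GPU memory for CogVideoX-2b (the only built-in video model here).
import base64

from xinference.client import Client

client = Client("http://127.0.0.1:9997")

# model_type="video" makes get_model() return the new RESTfulVideoModelHandle.
model_uid = client.launch_model(
    model_uid="my_video_model",
    model_name="CogVideoX-2b",
    model_type="video",
)
model = client.get_model(model_uid)

# text_to_video posts to /v1/video/generations; extra kwargs are JSON-encoded
# and unpacked server-side before reaching the diffusers pipeline.
result = model.text_to_video(
    prompt="A panda playing guitar in a bamboo forest",
    n=1,
    num_inference_steps=50,
)

# The default response_format is "b64_json", so each entry carries the video
# bytes inline; decode them and write .mp4 files locally.
for i, video in enumerate(result["data"]):
    with open(f"video_{i}.mp4", "wb") as f:
        f.write(base64.b64decode(video["b64_json"]))
```

With the default `b64_json` format the server deletes each generated file after encoding it, so persisting the decoded bytes is the caller's responsibility.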
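
The endpoint can also be called without the Python client. A sketch with `requests`, assuming authentication is disabled and that a video model was already launched under the uid `my_video_model`; note that `kwargs` travels as a JSON-encoded string, mirroring what the client handle sends.

```python
# Sketch of a raw REST call to the new route; the endpoint URL and model uid
# are assumptions, and auth headers would be needed if authentication is on.
import json

import requests

resp = requests.post(
    "http://127.0.0.1:9997/v1/video/generations",
    json={
        "model": "my_video_model",
        "prompt": "A panda playing guitar in a bamboo forest",
        "n": 1,
        # Extra pipeline arguments must be a JSON string; the server runs
        # json.loads() on it before forwarding to text_to_video().
        "kwargs": json.dumps({"response_format": "url", "num_inference_steps": 50}),
    },
)
resp.raise_for_status()
video_list = resp.json()

# With response_format="url" each entry points at an .mp4 file the server
# wrote under XINFERENCE_HOME/video (XINFERENCE_VIDEO_DIR).
print([v["url"] for v in video_list["data"]])
```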
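
Finally, a sketch of driving the model in-process, following `test_diffusers_video.py`. It assumes a CUDA device (with `cpu_offload` enabled, `text_to_video` hard-codes `device = "cuda"`) and relies on `load()` resolving a string `torch_dtype` via `getattr(torch, ...)`; the exact kwargs shown are illustrative, not required.

```python
# In-process sketch; cpu_offload and torch_dtype are the kwargs that
# DiffUsersVideoModel.load() inspects before building the pipeline.
from xinference.model.video import BUILTIN_VIDEO_MODELS
from xinference.model.video.core import cache
from xinference.model.video.diffusers import DiffUsersVideoModel

spec = BUILTIN_VIDEO_MODELS["CogVideoX-2b"]
model_path = cache(spec)  # downloads the weights on first use

model = DiffUsersVideoModel(
    "my_video_model",
    model_path,
    spec,
    cpu_offload=True,        # enables enable_model_cpu_offload() at load time
    torch_dtype="float16",   # string dtypes are resolved via getattr(torch, ...)
)
# load() also forwards these kwargs to from_pretrained(), which is expected
# to ignore keys it does not recognize (such as cpu_offload).
model.load()

result = model.text_to_video(
    "an apple rolling on a table", n=1, response_format="url"
)
print(result["data"][0]["url"])
```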