From 90665b2fce3d7f08630fedf2402309142900382d Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Thu, 8 Aug 2024 20:49:47 +0200 Subject: [PATCH 01/15] dev --- xinference/constants.py | 1 + xinference/model/video/__init__.py | 63 +++++++ xinference/model/video/core.py | 177 ++++++++++++++++++ xinference/model/video/diffusers.py | 100 ++++++++++ xinference/model/video/model_spec.json | 11 ++ .../model/video/model_spec_modelscope.json | 12 ++ xinference/model/video/tests/__init__.py | 13 ++ .../model/video/tests/test_diffusers_video.py | 37 ++++ 8 files changed, 414 insertions(+) create mode 100644 xinference/model/video/__init__.py create mode 100644 xinference/model/video/core.py create mode 100644 xinference/model/video/diffusers.py create mode 100644 xinference/model/video/model_spec.json create mode 100644 xinference/model/video/model_spec_modelscope.json create mode 100644 xinference/model/video/tests/__init__.py create mode 100644 xinference/model/video/tests/test_diffusers_video.py diff --git a/xinference/constants.py b/xinference/constants.py index 3efad56ed3..c9ba4e5ddc 100644 --- a/xinference/constants.py +++ b/xinference/constants.py @@ -47,6 +47,7 @@ def get_xinference_home() -> str: XINFERENCE_MODEL_DIR = os.path.join(XINFERENCE_HOME, "model") XINFERENCE_LOG_DIR = os.path.join(XINFERENCE_HOME, "logs") XINFERENCE_IMAGE_DIR = os.path.join(XINFERENCE_HOME, "image") +XINFERENCE_VIDEO_DIR = os.path.join(XINFERENCE_HOME, "video") XINFERENCE_AUTH_DIR = os.path.join(XINFERENCE_HOME, "auth") XINFERENCE_CSG_ENDPOINT = str( os.environ.get(XINFERENCE_ENV_CSG_ENDPOINT, "https://hub-stg.opencsg.com/") diff --git a/xinference/model/video/__init__.py b/xinference/model/video/__init__.py new file mode 100644 index 0000000000..45f86f36be --- /dev/null +++ b/xinference/model/video/__init__.py @@ -0,0 +1,63 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import codecs +import json +import os +from itertools import chain + +from .core import ( + BUILTIN_VIDEO_MODELS, + VIDEO_MODEL_DESCRIPTIONS, + MODEL_NAME_TO_REVISION, + MODELSCOPE_VIDEO_MODELS, + VideoModelFamilyV1, + generate_image_description, + get_cache_status, + get_image_model_descriptions, +) + + +_model_spec_json = os.path.join(os.path.dirname(__file__), "model_spec.json") +_model_spec_modelscope_json = os.path.join( + os.path.dirname(__file__), "model_spec_modelscope.json" +) +BUILTIN_VIDEO_MODELS.update( + dict( + (spec["model_name"], VideoModelFamilyV1(**spec)) + for spec in json.load(codecs.open(_model_spec_json, "r", encoding="utf-8")) + ) +) +for model_name, model_spec in BUILTIN_VIDEO_MODELS.items(): + MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision) + +MODELSCOPE_VIDEO_MODELS.update( + dict( + (spec["model_name"], VideoModelFamilyV1(**spec)) + for spec in json.load( + codecs.open(_model_spec_modelscope_json, "r", encoding="utf-8") + ) + ) +) +for model_name, model_spec in MODELSCOPE_VIDEO_MODELS.items(): + MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision) + +# register model description +for model_name, model_spec in chain( + MODELSCOPE_VIDEO_MODELS.items(), BUILTIN_VIDEO_MODELS.items() +): + VIDEO_MODEL_DESCRIPTIONS.update(generate_image_description(model_spec)) + +del _model_spec_json +del _model_spec_modelscope_json diff --git a/xinference/model/video/core.py b/xinference/model/video/core.py new file mode 100644 index 0000000000..8a9f15ef95 --- /dev/null +++ b/xinference/model/video/core.py @@ -0,0 +1,177 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import logging +import os +from collections import defaultdict +from typing import Dict, List, Literal, Optional, Tuple + +from ...constants import XINFERENCE_CACHE_DIR +from ..core import CacheableModelSpec, ModelDescription +from ..utils import valid_model_revision +from .diffusers import DiffUsersVideoModel + +MAX_ATTEMPTS = 3 + +logger = logging.getLogger(__name__) + +MODEL_NAME_TO_REVISION: Dict[str, List[str]] = defaultdict(list) +VIDEO_MODEL_DESCRIPTIONS: Dict[str, List[Dict]] = defaultdict(list) +BUILTIN_VIDEO_MODELS: Dict[str, "VideoModelFamilyV1"] = {} +MODELSCOPE_VIDEO_MODELS: Dict[str, "VideoModelFamilyV1"] = {} + + +def get_image_model_descriptions(): + import copy + + return copy.deepcopy(VIDEO_MODEL_DESCRIPTIONS) + + +class VideoModelFamilyV1(CacheableModelSpec): + model_family: str + model_name: str + model_id: str + model_revision: str + model_hub: str = "huggingface" + model_ability: Optional[List[str]] + + +class VideoModelDescription(ModelDescription): + def __init__( + self, + address: Optional[str], + devices: Optional[List[str]], + model_spec: VideoModelFamilyV1, + model_path: Optional[str] = None, + ): + super().__init__(address, devices, model_path=model_path) + self._model_spec = model_spec + + def to_dict(self): + return { + "model_type": "image", + "address": self.address, + "accelerators": self.devices, + "model_name": self._model_spec.model_name, + "model_family": self._model_spec.model_family, + "model_revision": self._model_spec.model_revision, + "model_ability": self._model_spec.model_ability, + } + + def to_version_info(self): + if self._model_path is None: + is_cached = get_cache_status(self._model_spec) + file_location = get_cache_dir(self._model_spec) + else: + is_cached = True + file_location = self._model_path + + return [ + { + "model_version": self._model_spec.model_name, + "model_file_location": file_location, + "cache_status": is_cached, + } + ] + + +def generate_image_description( + image_model: VideoModelFamilyV1, +) -> Dict[str, List[Dict]]: + res = defaultdict(list) + res[image_model.model_name].extend( + VideoModelDescription(None, None, image_model).to_version_info() + ) + return res + + +def match_diffusion( + model_name: str, + download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None, +) -> VideoModelFamilyV1: + from ..utils import download_from_modelscope + from . 
import BUILTIN_VIDEO_MODELS, MODELSCOPE_VIDEO_MODELS + + if download_hub == "modelscope" and model_name in MODELSCOPE_VIDEO_MODELS: + logger.debug(f"Image model {model_name} found in ModelScope.") + return MODELSCOPE_VIDEO_MODELS[model_name] + elif download_hub == "huggingface" and model_name in BUILTIN_VIDEO_MODELS: + logger.debug(f"Image model {model_name} found in Huggingface.") + return BUILTIN_VIDEO_MODELS[model_name] + elif download_from_modelscope() and model_name in MODELSCOPE_VIDEO_MODELS: + logger.debug(f"Image model {model_name} found in ModelScope.") + return MODELSCOPE_VIDEO_MODELS[model_name] + elif model_name in BUILTIN_VIDEO_MODELS: + logger.debug(f"Image model {model_name} found in Huggingface.") + return BUILTIN_VIDEO_MODELS[model_name] + else: + raise ValueError( + f"Image model {model_name} not found, available" + f"model list: {BUILTIN_VIDEO_MODELS.keys()}" + ) + + +def cache(model_spec: VideoModelFamilyV1): + from ..utils import cache + + return cache(model_spec, VideoModelDescription) + + +def get_cache_dir(model_spec: VideoModelFamilyV1): + return os.path.realpath(os.path.join(XINFERENCE_CACHE_DIR, model_spec.model_name)) + + +def get_cache_status( + model_spec: VideoModelFamilyV1, +) -> bool: + cache_dir = get_cache_dir(model_spec) + meta_path = os.path.join(cache_dir, "__valid_download") + + model_name = model_spec.model_name + if model_name in BUILTIN_VIDEO_MODELS and model_name in MODELSCOPE_VIDEO_MODELS: + hf_spec = BUILTIN_VIDEO_MODELS[model_name] + ms_spec = MODELSCOPE_VIDEO_MODELS[model_name] + + return any( + [ + valid_model_revision(meta_path, hf_spec.model_revision), + valid_model_revision(meta_path, ms_spec.model_revision), + ] + ) + else: # Usually for UT + return valid_model_revision(meta_path, model_spec.model_revision) + + +def create_video_model_instance( + subpool_addr: str, + devices: List[str], + model_uid: str, + model_name: str, + download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None, + model_path: Optional[str] = None, + **kwargs, +) -> Tuple[DiffUsersVideoModel, VideoModelDescription]: + model_spec = match_diffusion(model_name, download_hub) + if not model_path: + model_path = cache(model_spec) + + model = DiffUsersVideoModel( + model_uid, + model_path, + model_spec, + **kwargs, + ) + model_description = VideoModelDescription( + subpool_addr, devices, model_spec, model_path=model_path + ) + return model, model_description diff --git a/xinference/model/video/diffusers.py b/xinference/model/video/diffusers.py new file mode 100644 index 0000000000..4bf1205aed --- /dev/null +++ b/xinference/model/video/diffusers.py @@ -0,0 +1,100 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +import os +import sys +import uuid +import torch + +from ...constants import XINFERENCE_VIDEO_DIR +from ...device_utils import move_model_to_available_device + +logger = logging.getLogger(__name__) + + +class DiffUsersVideoModel: + def __init__( + self, + model_uid: str, + model_path: str, + model_spec: "VideoModelFamilyV1", + **kwargs, + ): + self._model_uid = model_uid + self._model_path = model_path + self._model_spec = model_spec + self._model = None + self._kwargs = kwargs + + def load(self): + import torch + + torch_dtype = self._kwargs.get("torch_dtype") + if sys.platform != "darwin" and torch_dtype is None: + # The following params crashes on Mac M2 + self._kwargs["torch_dtype"] = torch.float16 + self._kwargs["variant"] = "fp16" + self._kwargs["use_safetensors"] = True + if isinstance(torch_dtype, str): + self._kwargs["torch_dtype"] = getattr(torch, torch_dtype) + + if self._model_spec.model_family == "CogVideoX": + from diffusers import CogVideoXPipeline + + self._model = CogVideoXPipeline.from_pretrained( + self._model_path, torch_dtype=torch.float16 + ) + else: + raise Exception( + f"Unsupported model family: {self._model_spec.model_family}" + ) + + if self._kwargs.get("cpu_offload", False): + logger.debug("CPU offloading model") + self._model.enable_model_cpu_offload() + elif not self._kwargs.get("device_map"): + logger.debug("Loading model to available device") + self._model = move_model_to_available_device(self._model) + # Recommended if your computer has < 64 GB of RAM + self._model.enable_attention_slicing() + + def text_to_image( + self, + prompt: str, + n: int = 1, + **kwargs, + ): + from diffusers.utils import export_to_video + + logger.debug( + "diffusers args: %s", + kwargs, + ) + # assert callable(self._model) + prompt_embeds, _ = self._model.encode_prompt( + prompt=prompt, + do_classifier_free_guidance=True, + num_videos_per_prompt=n, + max_sequence_length=226, + device=self._model.device, + dtype=torch.float16, + ) + video = self._model( + num_inference_steps=50, + guidance_scale=6, + prompt_embeds=prompt_embeds, + ).frames[0] + path = os.path.join(XINFERENCE_VIDEO_DIR, uuid.uuid4().hex + ".jpg") + export_to_video(video, path, fps=8) diff --git a/xinference/model/video/model_spec.json b/xinference/model/video/model_spec.json new file mode 100644 index 0000000000..52b748fd6a --- /dev/null +++ b/xinference/model/video/model_spec.json @@ -0,0 +1,11 @@ +[ + { + "model_name": "CogVideoX-2b", + "model_family": "CogVideoX", + "model_id": "THUDM/CogVideoX-2b", + "model_revision": "4bbfb1de622b80bc1b77b6e9aced75f816be0e38", + "model_ability": [ + "text2video" + ] + } +] diff --git a/xinference/model/video/model_spec_modelscope.json b/xinference/model/video/model_spec_modelscope.json new file mode 100644 index 0000000000..e3cb604921 --- /dev/null +++ b/xinference/model/video/model_spec_modelscope.json @@ -0,0 +1,12 @@ +[ + { + "model_name": "CogVideoX-2b", + "model_family": "CogVideoX", + "model_hub": "modelscope", + "model_id": "ZhipuAI/CogVideoX-2b", + "model_revision": "master", + "model_ability": [ + "text2video" + ] + } +] diff --git a/xinference/model/video/tests/__init__.py b/xinference/model/video/tests/__init__.py new file mode 100644 index 0000000000..37f6558d95 --- /dev/null +++ b/xinference/model/video/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/xinference/model/video/tests/test_diffusers_video.py b/xinference/model/video/tests/test_diffusers_video.py
new file mode 100644
index 0000000000..b99f478d02
--- /dev/null
+++ b/xinference/model/video/tests/test_diffusers_video.py
@@ -0,0 +1,37 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import base64
+import logging
+import os.path
+import shutil
+from io import BytesIO
+
+from PIL import Image
+
+from ..core import cache
+from ..diffusers import DiffUsersVideoModel
+from .. import BUILTIN_VIDEO_MODELS
+
+
+logger = logging.getLogger(__name__)
+
+
+def test_model():
+    test_model_spec = next(iter(BUILTIN_VIDEO_MODELS.values()))
+    model_path = cache(test_model_spec)
+    model = DiffUsersVideoModel("mock", model_path, test_model_spec)
+    # input is a string
+    input_text = "an apple"
+    model.load()
+    r = model.text_to_image(input_text)

From f24ff2da288ae54791247e8be5fdf3981bdb1465 Mon Sep 17 00:00:00 2001
From: codingl2k1
Date: Thu, 8 Aug 2024 21:19:27 +0200
Subject: [PATCH 02/15] List video for /v1/model_registrations/video

---
 xinference/api/restful_api.py               | 40 +++++++++++++++++++++
 xinference/client/restful/restful_client.py | 39 ++++++++++++++++++++
 xinference/core/model.py                    | 21 +++++++++++
 xinference/core/supervisor.py               | 37 +++++++++++++++++++
 xinference/core/worker.py                   |  2 ++
 xinference/model/core.py                    | 12 +++++++
 xinference/model/video/diffusers.py         | 16 ++++++---
 xinference/types.py                         |  5 +++
 8 files changed, 167 insertions(+), 5 deletions(-)

diff --git a/xinference/api/restful_api.py b/xinference/api/restful_api.py
index 87e19d98cf..f330f58e19 100644
--- a/xinference/api/restful_api.py
+++ b/xinference/api/restful_api.py
@@ -123,6 +123,14 @@ class TextToImageRequest(BaseModel):
     user: Optional[str] = None
 
 
+class TextToVideoRequest(BaseModel):
+    model: str
+    prompt: Union[str, List[str]] = Field(description="The prompt to generate videos from.")
+    n: Optional[int] = 1
+    kwargs: Optional[str] = None
+    user: Optional[str] = None
+
+
 class SpeechRequest(BaseModel):
     model: str
     input: str
@@ -1546,6 +1554,38 @@ async def create_flexible_infer(self, request: Request) -> Response:
             await self._report_error_event(model_uid, str(e))
             raise HTTPException(status_code=500, detail=str(e))
 
+    async def create_videos(self, request: Request) -> Response:
+        body = TextToVideoRequest.parse_obj(await request.json())
+        model_uid = body.model
+        try:
+            model = await (await self._get_supervisor_ref()).get_model(model_uid)
+        except ValueError as ve:
+            logger.error(str(ve), exc_info=True)
+            await self._report_error_event(model_uid, str(ve))
+            raise 
HTTPException(status_code=400, detail=str(ve)) + except Exception as e: + logger.error(e, exc_info=True) + await self._report_error_event(model_uid, str(e)) + raise HTTPException(status_code=500, detail=str(e)) + + try: + kwargs = json.loads(body.kwargs) if body.kwargs else {} + video_list = await model.text_to_video( + prompt=body.prompt, + n=body.n, + **kwargs, + ) + return Response(content=video_list, media_type="application/json") + except RuntimeError as re: + logger.error(re, exc_info=True) + await self._report_error_event(model_uid, str(re)) + self.handle_request_limit_error(re) + raise HTTPException(status_code=400, detail=str(re)) + except Exception as e: + logger.error(e, exc_info=True) + await self._report_error_event(model_uid, str(e)) + raise HTTPException(status_code=500, detail=str(e)) + async def create_chat_completion(self, request: Request) -> Response: raw_body = await request.json() body = CreateChatCompletion.parse_obj(raw_body) diff --git a/xinference/client/restful/restful_client.py b/xinference/client/restful/restful_client.py index aa0955f75d..c4368368d2 100644 --- a/xinference/client/restful/restful_client.py +++ b/xinference/client/restful/restful_client.py @@ -29,6 +29,7 @@ CompletionChunk, Embedding, ImageList, + VideoList, LlamaCppGenerateConfig, PytorchGenerateConfig, ) @@ -370,6 +371,44 @@ def inpainting( return response_data +class RESTfulVideoModelHandle(RESTfulModelHandle): + def text_to_video( + self, + prompt: str, + n: int = 1, + **kwargs, + ) -> "VideoList": + """ + Creates an image by the input text. + + Parameters + ---------- + prompt: `str` or `List[str]` + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + n: `int`, defaults to 1 + The number of images to generate per prompt. Must be between 1 and 10. + Returns + ------- + ImageList + A list of image objects. + """ + url = f"{self._base_url}/v1/images/generations" + request_body = { + "model": self._model_uid, + "prompt": prompt, + "n": n, + "kwargs": json.dumps(kwargs), + } + response = requests.post(url, json=request_body, headers=self.auth_headers) + if response.status_code != 200: + raise RuntimeError( + f"Failed to create the images, detail: {_get_error_string(response)}" + ) + + response_data = response.json() + return response_data + + class RESTfulGenerateModelHandle(RESTfulModelHandle): def generate( self, diff --git a/xinference/core/model.py b/xinference/core/model.py index 7fc41b9c53..24cfe3c6e8 100644 --- a/xinference/core/model.py +++ b/xinference/core/model.py @@ -774,6 +774,27 @@ async def infer( f"Model {self._model.model_spec} is not for flexible infer." ) + @log_async(logger=logger) + @request_limit + async def text_to_video( + self, + prompt: str, + n: int = 1, + *args, + **kwargs, + ): + if hasattr(self._model, "text_to_video"): + return await self._call_wrapper_json( + self._model.text_to_video, + prompt, + n, + *args, + **kwargs, + ) + raise AttributeError( + f"Model {self._model.model_spec} is not for creating video." 
+ ) + async def record_metrics(self, name, op, kwargs): worker_ref = await self._get_worker_ref() await worker_ref.record_metrics(name, op, kwargs) diff --git a/xinference/core/supervisor.py b/xinference/core/supervisor.py index 54e4b65849..272f4c23cb 100644 --- a/xinference/core/supervisor.py +++ b/xinference/core/supervisor.py @@ -59,6 +59,7 @@ if TYPE_CHECKING: from ..model.audio import AudioModelFamilyV1 + from ..model.video import VideoModelFamilyV1 from ..model.embedding import EmbeddingModelSpec from ..model.flexible import FlexibleModelSpec from ..model.image import ImageModelFamilyV1 @@ -484,6 +485,31 @@ async def _to_audio_model_reg( res["model_instance_count"] = instance_cnt return res + async def _to_video_model_reg( + self, model_family: "VideoModelFamilyV1", is_builtin: bool + ) -> Dict[str, Any]: + from ..model.video import get_cache_status + + instance_cnt = await self.get_instance_count(model_family.model_name) + version_cnt = await self.get_model_version_count(model_family.model_name) + + if self.is_local_deployment(): + # TODO: does not work when the supervisor and worker are running on separate nodes. + cache_status = get_cache_status(model_family) + res = { + **model_family.dict(), + "cache_status": cache_status, + "is_builtin": is_builtin, + } + else: + res = { + **model_family.dict(), + "is_builtin": is_builtin, + } + res["model_version_count"] = version_cnt + res["model_instance_count"] = instance_cnt + return res + async def _to_flexible_model_reg( self, model_spec: "FlexibleModelSpec", is_builtin: bool ) -> Dict[str, Any]: @@ -602,6 +628,17 @@ def sort_helper(item): {"model_name": model_spec.model_name, "is_builtin": False} ) + ret.sort(key=sort_helper) + return ret + elif model_type == "video": + from ..model.video import BUILTIN_VIDEO_MODELS + + for model_name, family in BUILTIN_VIDEO_MODELS.items(): + if detailed: + ret.append(await self._to_video_model_reg(family, is_builtin=True)) + else: + ret.append({"model_name": model_name, "is_builtin": True}) + ret.sort(key=sort_helper) return ret elif model_type == "rerank": diff --git a/xinference/core/worker.py b/xinference/core/worker.py index 9524bd604a..cfffd7fb17 100644 --- a/xinference/core/worker.py +++ b/xinference/core/worker.py @@ -735,6 +735,8 @@ async def _get_model_ability(self, model: Any, model_type: str) -> List[str]: return ["text_to_image"] elif model_type == "audio": return ["audio_to_text"] + elif model_type == "video": + return ["text_to_video"] elif model_type == "flexible": return ["flexible"] else: diff --git a/xinference/model/core.py b/xinference/model/core.py index 09cb4104a4..4df04e621f 100644 --- a/xinference/model/core.py +++ b/xinference/model/core.py @@ -60,6 +60,7 @@ def create_model_instance( **kwargs, ) -> Tuple[Any, ModelDescription]: from .audio.core import create_audio_model_instance + from .video.core import create_video_model_instance from .embedding.core import create_embedding_model_instance from .flexible.core import create_flexible_model_instance from .image.core import create_image_model_instance @@ -127,6 +128,17 @@ def create_model_instance( model_path, **kwargs, ) + elif model_type == "video": + kwargs.pop("trust_remote_code", None) + return create_video_model_instance( + subpool_addr, + devices, + model_uid, + model_name, + download_hub, + model_path, + **kwargs, + ) elif model_type == "flexible": kwargs.pop("trust_remote_code", None) return create_flexible_model_instance( diff --git a/xinference/model/video/diffusers.py b/xinference/model/video/diffusers.py index 
4bf1205aed..b629e81807 100644 --- a/xinference/model/video/diffusers.py +++ b/xinference/model/video/diffusers.py @@ -17,9 +17,11 @@ import sys import uuid import torch +import time from ...constants import XINFERENCE_VIDEO_DIR from ...device_utils import move_model_to_available_device +from ...types import VideoList logger = logging.getLogger(__name__) @@ -75,7 +77,7 @@ def text_to_image( prompt: str, n: int = 1, **kwargs, - ): + ) -> VideoList: from diffusers.utils import export_to_video logger.debug( @@ -91,10 +93,14 @@ def text_to_image( device=self._model.device, dtype=torch.float16, ) - video = self._model( + output = self._model( num_inference_steps=50, guidance_scale=6, prompt_embeds=prompt_embeds, - ).frames[0] - path = os.path.join(XINFERENCE_VIDEO_DIR, uuid.uuid4().hex + ".jpg") - export_to_video(video, path, fps=8) + ) + urls = [] + for f in output.frames: + path = os.path.join(XINFERENCE_VIDEO_DIR, uuid.uuid4().hex + ".jpg") + p = export_to_video(f, path, fps=8) + urls.append(p) + return VideoList(created=int(time.time()), data=urls) diff --git a/xinference/types.py b/xinference/types.py index e66e90bee1..4e1106ae51 100644 --- a/xinference/types.py +++ b/xinference/types.py @@ -52,6 +52,11 @@ class ImageList(TypedDict): data: List[Image] +class VideoList(TypedDict): + created: int + data: List[str] + + class EmbeddingUsage(TypedDict): prompt_tokens: int total_tokens: int From 5248b297a884e5b1ff8822c474aaf61e3cef8fa3 Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Thu, 8 Aug 2024 21:21:48 +0200 Subject: [PATCH 03/15] Add restful client --- xinference/api/restful_api.py | 12 ++++++++++++ xinference/client/restful/restful_client.py | 6 +++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/xinference/api/restful_api.py b/xinference/api/restful_api.py index f330f58e19..31fac8be86 100644 --- a/xinference/api/restful_api.py +++ b/xinference/api/restful_api.py @@ -64,6 +64,7 @@ CreateChatCompletion, CreateCompletion, ImageList, + VideoList, PeftModelConfig, max_tokens_field, ) @@ -520,6 +521,17 @@ async def internal_exception_handler(request: Request, exc: Exception): else None ), ) + self._router.add_api_route( + "/v1/video/generations", + self.create_videos, + methods=["POST"], + response_model=VideoList, + dependencies=( + [Security(self._auth_service, scopes=["models:read"])] + if self.is_authenticated() + else None + ), + ) self._router.add_api_route( "/v1/chat/completions", self.create_chat_completion, diff --git a/xinference/client/restful/restful_client.py b/xinference/client/restful/restful_client.py index c4368368d2..987ddac268 100644 --- a/xinference/client/restful/restful_client.py +++ b/xinference/client/restful/restful_client.py @@ -392,7 +392,7 @@ def text_to_video( ImageList A list of image objects. 
""" - url = f"{self._base_url}/v1/images/generations" + url = f"{self._base_url}/v1/video/generations" request_body = { "model": self._model_uid, "prompt": prompt, @@ -1054,6 +1054,10 @@ def get_model(self, model_uid: str) -> RESTfulModelHandle: return RESTfulAudioModelHandle( model_uid, self.base_url, auth_headers=self._headers ) + elif desc["model_type"] == "video": + return RESTfulVideoModelHandle( + model_uid, self.base_url, auth_headers=self._headers + ) elif desc["model_type"] == "flexible": return RESTfulFlexibleModelHandle( model_uid, self.base_url, auth_headers=self._headers From bc37a0987c6d0ab3eab6796c03c21421a61c1bcc Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Thu, 8 Aug 2024 21:52:00 +0200 Subject: [PATCH 04/15] Fix --- xinference/model/video/__init__.py | 6 +++--- xinference/model/video/core.py | 22 +++++++++++----------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/xinference/model/video/__init__.py b/xinference/model/video/__init__.py index 45f86f36be..68bec15379 100644 --- a/xinference/model/video/__init__.py +++ b/xinference/model/video/__init__.py @@ -23,9 +23,9 @@ MODEL_NAME_TO_REVISION, MODELSCOPE_VIDEO_MODELS, VideoModelFamilyV1, - generate_image_description, + generate_video_description, get_cache_status, - get_image_model_descriptions, + get_video_model_descriptions, ) @@ -57,7 +57,7 @@ for model_name, model_spec in chain( MODELSCOPE_VIDEO_MODELS.items(), BUILTIN_VIDEO_MODELS.items() ): - VIDEO_MODEL_DESCRIPTIONS.update(generate_image_description(model_spec)) + VIDEO_MODEL_DESCRIPTIONS.update(generate_video_description(model_spec)) del _model_spec_json del _model_spec_modelscope_json diff --git a/xinference/model/video/core.py b/xinference/model/video/core.py index 8a9f15ef95..f2b0a3cce2 100644 --- a/xinference/model/video/core.py +++ b/xinference/model/video/core.py @@ -31,7 +31,7 @@ MODELSCOPE_VIDEO_MODELS: Dict[str, "VideoModelFamilyV1"] = {} -def get_image_model_descriptions(): +def get_video_model_descriptions(): import copy return copy.deepcopy(VIDEO_MODEL_DESCRIPTIONS) @@ -59,7 +59,7 @@ def __init__( def to_dict(self): return { - "model_type": "image", + "model_type": "video", "address": self.address, "accelerators": self.devices, "model_name": self._model_spec.model_name, @@ -85,12 +85,12 @@ def to_version_info(self): ] -def generate_image_description( - image_model: VideoModelFamilyV1, +def generate_video_description( + video_model: VideoModelFamilyV1, ) -> Dict[str, List[Dict]]: res = defaultdict(list) - res[image_model.model_name].extend( - VideoModelDescription(None, None, image_model).to_version_info() + res[video_model.model_name].extend( + VideoModelDescription(None, None, video_model).to_version_info() ) return res @@ -103,20 +103,20 @@ def match_diffusion( from . 
import BUILTIN_VIDEO_MODELS, MODELSCOPE_VIDEO_MODELS if download_hub == "modelscope" and model_name in MODELSCOPE_VIDEO_MODELS: - logger.debug(f"Image model {model_name} found in ModelScope.") + logger.debug(f"Video model {model_name} found in ModelScope.") return MODELSCOPE_VIDEO_MODELS[model_name] elif download_hub == "huggingface" and model_name in BUILTIN_VIDEO_MODELS: - logger.debug(f"Image model {model_name} found in Huggingface.") + logger.debug(f"Video model {model_name} found in Huggingface.") return BUILTIN_VIDEO_MODELS[model_name] elif download_from_modelscope() and model_name in MODELSCOPE_VIDEO_MODELS: - logger.debug(f"Image model {model_name} found in ModelScope.") + logger.debug(f"Video model {model_name} found in ModelScope.") return MODELSCOPE_VIDEO_MODELS[model_name] elif model_name in BUILTIN_VIDEO_MODELS: - logger.debug(f"Image model {model_name} found in Huggingface.") + logger.debug(f"Video model {model_name} found in Huggingface.") return BUILTIN_VIDEO_MODELS[model_name] else: raise ValueError( - f"Image model {model_name} not found, available" + f"Video model {model_name} not found, available" f"model list: {BUILTIN_VIDEO_MODELS.keys()}" ) From ed92c9ef35c717c4e2b963a66955769334d6264d Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Thu, 8 Aug 2024 21:52:30 +0200 Subject: [PATCH 05/15] Add running models UI --- .../web/ui/src/scenes/launch_model/index.js | 4 ++++ .../web/ui/src/scenes/running_models/index.js | 24 +++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/xinference/web/ui/src/scenes/launch_model/index.js b/xinference/web/ui/src/scenes/launch_model/index.js index 1339e94d4f..55f05747bd 100644 --- a/xinference/web/ui/src/scenes/launch_model/index.js +++ b/xinference/web/ui/src/scenes/launch_model/index.js @@ -69,6 +69,7 @@ const LaunchModel = () => { + @@ -93,6 +94,9 @@ const LaunchModel = () => { + + + diff --git a/xinference/web/ui/src/scenes/running_models/index.js b/xinference/web/ui/src/scenes/running_models/index.js index e91858f2fd..9f9486651a 100644 --- a/xinference/web/ui/src/scenes/running_models/index.js +++ b/xinference/web/ui/src/scenes/running_models/index.js @@ -21,6 +21,7 @@ const RunningModels = () => { const [embeddingModelData, setEmbeddingModelData] = useState([]) const [imageModelData, setImageModelData] = useState([]) const [audioModelData, setAudioModelData] = useState([]) + const [videoModelData, setVideoModelData] = useState([]) const [rerankModelData, setRerankModelData] = useState([]) const [flexibleModelData, setFlexibleModelData] = useState([]) const { isCallingApi, setIsCallingApi } = useContext(ApiContext) @@ -53,6 +54,9 @@ const RunningModels = () => { setAudioModelData([ { id: 'Loading, do not refresh page...', url: 'IS_LOADING' }, ]) + setVideoModelData([ + { id: 'Loading, do not refresh page...', url: 'IS_LOADING' }, + ]) setImageModelData([ { id: 'Loading, do not refresh page...', url: 'IS_LOADING' }, ]) @@ -72,6 +76,7 @@ const RunningModels = () => { const newEmbeddingModelData = [] const newImageModelData = [] const newAudioModelData = [] + const newVideoModelData = [] const newRerankModelData = [] const newFlexibleModelData = [] response.data.forEach((model) => { @@ -86,6 +91,8 @@ const RunningModels = () => { newEmbeddingModelData.push(newValue) } else if (newValue.model_type === 'audio') { newAudioModelData.push(newValue) + } else if (newValue.model_type === 'video') { + newVideoModelData.push(newValue) } else if (newValue.model_type === 'image') { newImageModelData.push(newValue) } else if 
(newValue.model_type === 'rerank') { @@ -97,6 +104,7 @@ const RunningModels = () => { setLlmData(newLlmData) setEmbeddingModelData(newEmbeddingModelData) setAudioModelData(newAudioModelData) + setVideoModelData(newVideoModelData) setImageModelData(newImageModelData) setRerankModelData(newRerankModelData) setFlexibleModelData(newFlexibleModelData) @@ -591,6 +599,7 @@ const RunningModels = () => { }, ] const audioModelColumns = embeddingModelColumns + const videoModelColumns = embeddingModelColumns const rerankModelColumns = embeddingModelColumns const flexibleModelColumns = embeddingModelColumns @@ -652,6 +661,7 @@ const RunningModels = () => { + @@ -725,6 +735,20 @@ const RunningModels = () => { /> + + + + + Date: Thu, 8 Aug 2024 22:05:33 +0200 Subject: [PATCH 06/15] Fix --- xinference/model/video/diffusers.py | 10 +++++-- .../model/video/tests/test_diffusers_video.py | 29 +++++++++++++++++++ 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/xinference/model/video/diffusers.py b/xinference/model/video/diffusers.py index b629e81807..94dde86aac 100644 --- a/xinference/model/video/diffusers.py +++ b/xinference/model/video/diffusers.py @@ -40,6 +40,10 @@ def __init__( self._model = None self._kwargs = kwargs + @property + def model_spec(self): + return self._model_spec + def load(self): import torch @@ -72,7 +76,7 @@ def load(self): # Recommended if your computer has < 64 GB of RAM self._model.enable_attention_slicing() - def text_to_image( + def text_to_video( self, prompt: str, n: int = 1, @@ -81,7 +85,7 @@ def text_to_image( from diffusers.utils import export_to_video logger.debug( - "diffusers args: %s", + "diffusers text_to_video args: %s", kwargs, ) # assert callable(self._model) @@ -100,7 +104,7 @@ def text_to_image( ) urls = [] for f in output.frames: - path = os.path.join(XINFERENCE_VIDEO_DIR, uuid.uuid4().hex + ".jpg") + path = os.path.join(XINFERENCE_VIDEO_DIR, uuid.uuid4().hex + ".mp4") p = export_to_video(f, path, fps=8) urls.append(p) return VideoList(created=int(time.time()), data=urls) diff --git a/xinference/model/video/tests/test_diffusers_video.py b/xinference/model/video/tests/test_diffusers_video.py index b99f478d02..65821f9046 100644 --- a/xinference/model/video/tests/test_diffusers_video.py +++ b/xinference/model/video/tests/test_diffusers_video.py @@ -35,3 +35,32 @@ def test_model(): input_text = "an apple" model.load() r = model.text_to_image(input_text) + assert r + + +def test_client(setup): + endpoint, _ = setup + from ....client import Client + + client = Client(endpoint) + + model_uid = client.launch_model( + model_uid="my_video_model", + model_name="CogVideoX-2b", + model_type="video", + ) + model = client.get_model(model_uid) + assert model + + r = model.text_to_video( + prompt="A panda, dressed in a small, red jacket and a tiny hat, " + "sits on a wooden stool in a serene bamboo forest. " + "The panda's fluffy paws strum a miniature acoustic guitar, " + "producing soft, melodic tunes. Nearby, a few other pandas gather, " + "watching curiously and some clapping in rhythm. " + "Sunlight filters through the tall bamboo, casting a gentle glow on the scene. " + "The panda's face is expressive, showing concentration and joy as it plays. " + "The background includes a small, flowing stream and vibrant green foliage, " + "enhancing the peaceful and magical atmosphere of this unique musical performance." 
+ ) + print(r) From ca57593137e9870d7c6ff37c6de16a403a937d21 Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Thu, 8 Aug 2024 22:07:36 +0200 Subject: [PATCH 07/15] Fix lint --- xinference/api/restful_api.py | 2 +- xinference/client/restful/restful_client.py | 2 +- xinference/core/supervisor.py | 2 +- xinference/model/core.py | 2 +- xinference/model/video/__init__.py | 3 +-- xinference/model/video/diffusers.py | 3 ++- xinference/model/video/tests/test_diffusers_video.py | 3 +-- 7 files changed, 8 insertions(+), 9 deletions(-) diff --git a/xinference/api/restful_api.py b/xinference/api/restful_api.py index 31fac8be86..47b4848c80 100644 --- a/xinference/api/restful_api.py +++ b/xinference/api/restful_api.py @@ -64,8 +64,8 @@ CreateChatCompletion, CreateCompletion, ImageList, - VideoList, PeftModelConfig, + VideoList, max_tokens_field, ) from .oauth2.auth_service import AuthService diff --git a/xinference/client/restful/restful_client.py b/xinference/client/restful/restful_client.py index 987ddac268..ca96d5a785 100644 --- a/xinference/client/restful/restful_client.py +++ b/xinference/client/restful/restful_client.py @@ -29,9 +29,9 @@ CompletionChunk, Embedding, ImageList, - VideoList, LlamaCppGenerateConfig, PytorchGenerateConfig, + VideoList, ) diff --git a/xinference/core/supervisor.py b/xinference/core/supervisor.py index 272f4c23cb..2b6f7b9fc5 100644 --- a/xinference/core/supervisor.py +++ b/xinference/core/supervisor.py @@ -59,12 +59,12 @@ if TYPE_CHECKING: from ..model.audio import AudioModelFamilyV1 - from ..model.video import VideoModelFamilyV1 from ..model.embedding import EmbeddingModelSpec from ..model.flexible import FlexibleModelSpec from ..model.image import ImageModelFamilyV1 from ..model.llm import LLMFamilyV1 from ..model.rerank import RerankModelSpec + from ..model.video import VideoModelFamilyV1 from .worker import WorkerActor diff --git a/xinference/model/core.py b/xinference/model/core.py index 4df04e621f..4591d255b0 100644 --- a/xinference/model/core.py +++ b/xinference/model/core.py @@ -60,12 +60,12 @@ def create_model_instance( **kwargs, ) -> Tuple[Any, ModelDescription]: from .audio.core import create_audio_model_instance - from .video.core import create_video_model_instance from .embedding.core import create_embedding_model_instance from .flexible.core import create_flexible_model_instance from .image.core import create_image_model_instance from .llm.core import create_llm_model_instance from .rerank.core import create_rerank_model_instance + from .video.core import create_video_model_instance if model_type == "LLM": return create_llm_model_instance( diff --git a/xinference/model/video/__init__.py b/xinference/model/video/__init__.py index 68bec15379..e1325b0bbb 100644 --- a/xinference/model/video/__init__.py +++ b/xinference/model/video/__init__.py @@ -19,16 +19,15 @@ from .core import ( BUILTIN_VIDEO_MODELS, - VIDEO_MODEL_DESCRIPTIONS, MODEL_NAME_TO_REVISION, MODELSCOPE_VIDEO_MODELS, + VIDEO_MODEL_DESCRIPTIONS, VideoModelFamilyV1, generate_video_description, get_cache_status, get_video_model_descriptions, ) - _model_spec_json = os.path.join(os.path.dirname(__file__), "model_spec.json") _model_spec_modelscope_json = os.path.join( os.path.dirname(__file__), "model_spec_modelscope.json" diff --git a/xinference/model/video/diffusers.py b/xinference/model/video/diffusers.py index 94dde86aac..d14a811de6 100644 --- a/xinference/model/video/diffusers.py +++ b/xinference/model/video/diffusers.py @@ -15,9 +15,10 @@ import logging import os import sys +import time import uuid + 
import torch -import time from ...constants import XINFERENCE_VIDEO_DIR from ...device_utils import move_model_to_available_device diff --git a/xinference/model/video/tests/test_diffusers_video.py b/xinference/model/video/tests/test_diffusers_video.py index 65821f9046..2982486f99 100644 --- a/xinference/model/video/tests/test_diffusers_video.py +++ b/xinference/model/video/tests/test_diffusers_video.py @@ -19,10 +19,9 @@ from PIL import Image +from .. import BUILTIN_VIDEO_MODELS from ..core import cache from ..diffusers import DiffUsersVideoModel -from .. import BUILTIN_VIDEO_MODELS - logger = logging.getLogger(__name__) From 00aa459ec24433aeceac7b77e02551b9b41e0a9b Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Thu, 8 Aug 2024 22:12:36 +0200 Subject: [PATCH 08/15] Fix lint --- xinference/client/restful/restful_client.py | 10 +++++----- xinference/model/video/tests/test_diffusers_video.py | 6 ------ 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/xinference/client/restful/restful_client.py b/xinference/client/restful/restful_client.py index ca96d5a785..24ba0b7038 100644 --- a/xinference/client/restful/restful_client.py +++ b/xinference/client/restful/restful_client.py @@ -379,18 +379,18 @@ def text_to_video( **kwargs, ) -> "VideoList": """ - Creates an image by the input text. + Creates a video by the input text. Parameters ---------- prompt: `str` or `List[str]` - The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + The prompt or prompts to guide video generation. If not defined, you need to pass `prompt_embeds`. n: `int`, defaults to 1 - The number of images to generate per prompt. Must be between 1 and 10. + The number of videos to generate per prompt. Must be between 1 and 10. Returns ------- - ImageList - A list of image objects. + VideoList + A list of video objects. """ url = f"{self._base_url}/v1/video/generations" request_body = { diff --git a/xinference/model/video/tests/test_diffusers_video.py b/xinference/model/video/tests/test_diffusers_video.py index 2982486f99..b5ca5dcb6f 100644 --- a/xinference/model/video/tests/test_diffusers_video.py +++ b/xinference/model/video/tests/test_diffusers_video.py @@ -11,13 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import base64 import logging -import os.path -import shutil -from io import BytesIO - -from PIL import Image from .. 
import BUILTIN_VIDEO_MODELS
 from ..core import cache
 from ..diffusers import DiffUsersVideoModel
-from .. import BUILTIN_VIDEO_MODELS
-
 
 logger = logging.getLogger(__name__)

From 00aa459ec24433aeceac7b77e02551b9b41e0a9b Mon Sep 17 00:00:00 2001
From: codingl2k1
Date: Thu, 8 Aug 2024 22:12:36 +0200
Subject: [PATCH 08/15] Fix lint

---
 xinference/client/restful/restful_client.py          | 10 +++++-----
 xinference/model/video/tests/test_diffusers_video.py |  6 ------
 2 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/xinference/client/restful/restful_client.py b/xinference/client/restful/restful_client.py
index ca96d5a785..24ba0b7038 100644
--- a/xinference/client/restful/restful_client.py
+++ b/xinference/client/restful/restful_client.py
@@ -379,18 +379,18 @@ def text_to_video(
         **kwargs,
     ) -> "VideoList":
         """
-        Creates an image by the input text.
+        Creates a video by the input text.
 
         Parameters
         ----------
         prompt: `str` or `List[str]`
-            The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            The prompt or prompts to guide video generation. If not defined, you need to pass `prompt_embeds`.
         n: `int`, defaults to 1
-            The number of images to generate per prompt. Must be between 1 and 10.
+            The number of videos to generate per prompt. Must be between 1 and 10.
         Returns
         -------
-        ImageList
-            A list of image objects.
+        VideoList
+            A list of video objects.
         """
diff --git a/xinference/model/video/tests/test_diffusers_video.py b/xinference/model/video/tests/test_diffusers_video.py
index 2982486f99..b5ca5dcb6f 100644
--- a/xinference/model/video/tests/test_diffusers_video.py
+++ b/xinference/model/video/tests/test_diffusers_video.py
@@ -11,13 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import base64
 import logging
-import os.path
-import shutil
-from io import BytesIO
-
-from PIL import Image
 
 from .. import BUILTIN_VIDEO_MODELS
 from ..core import cache

From db4c9828f620e975d677d61c9cb8f8976f000844 Mon Sep 17 00:00:00 2001
From: codingl2k1
Date: Thu, 8 Aug 2024 22:15:13 +0200
Subject: [PATCH 09/15] Fix typing

---
 xinference/model/video/diffusers.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/xinference/model/video/diffusers.py b/xinference/model/video/diffusers.py
index d14a811de6..85ab1cde8d 100644
--- a/xinference/model/video/diffusers.py
+++ b/xinference/model/video/diffusers.py
@@ -17,6 +17,7 @@
 import sys
 import time
 import uuid
+from typing import TYPE_CHECKING
 
 import torch
 
@@ -24,6 +25,10 @@
 from ...device_utils import move_model_to_available_device
 from ...types import VideoList
 
+if TYPE_CHECKING:
+    from .core import VideoModelFamilyV1
+
+
 logger = logging.getLogger(__name__)

From 2308d2345cb91cb31cf23b2bdb2e142dbc629c97 Mon Sep 17 00:00:00 2001
From: codingl2k1
Date: Thu, 8 Aug 2024 22:23:45 +0200
Subject: [PATCH 10/15] Fix lint

---
 xinference/model/video/core.py      | 1 +
 xinference/model/video/diffusers.py | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/xinference/model/video/core.py b/xinference/model/video/core.py
index f2b0a3cce2..3b9f96ad9a 100644
--- a/xinference/model/video/core.py
+++ b/xinference/model/video/core.py
@@ -164,6 +164,7 @@ def create_video_model_instance(
     model_spec = match_diffusion(model_name, download_hub)
     if not model_path:
         model_path = cache(model_spec)
+    assert model_path is not None
 
     model = DiffUsersVideoModel(
         model_uid,
diff --git a/xinference/model/video/diffusers.py b/xinference/model/video/diffusers.py
index 85ab1cde8d..930cefa09e 100644
--- a/xinference/model/video/diffusers.py
+++ b/xinference/model/video/diffusers.py
@@ -94,7 +94,7 @@ def text_to_video(
             "diffusers text_to_video args: %s",
             kwargs,
         )
-        # assert callable(self._model)
+        assert self._model is not None
         prompt_embeds, _ = self._model.encode_prompt(
             prompt=prompt,
             do_classifier_free_guidance=True,
@@ -103,6 +103,7 @@ def text_to_video(
             device=self._model.device,
             dtype=torch.float16,
         )
+        assert callable(self._model)
         output = self._model(
             num_inference_steps=50,
             guidance_scale=6,

From f5cbbd6ec7da997fef8a5253ff796ee11ede3a39 Mon Sep 17 00:00:00 2001
From: codingl2k1
Date: Thu, 8 Aug 2024 22:25:13 +0200
Subject: [PATCH 11/15] Fix

---
 xinference/model/video/tests/test_diffusers_video.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/xinference/model/video/tests/test_diffusers_video.py b/xinference/model/video/tests/test_diffusers_video.py
index b5ca5dcb6f..3676612c05 100644
--- a/xinference/model/video/tests/test_diffusers_video.py
+++ b/xinference/model/video/tests/test_diffusers_video.py
@@ -13,6 +13,8 @@
 # limitations under the License.
 import logging
 
+import pytest
+
 from .. import BUILTIN_VIDEO_MODELS
 from ..core import cache
 from ..diffusers import DiffUsersVideoModel
@@ -20,6 +22,7 @@
 logger = logging.getLogger(__name__)
 
 
+@pytest.mark.skip(reason="Video model requires too much GPU memory.")
 def test_model():
     test_model_spec = next(iter(BUILTIN_VIDEO_MODELS.values()))
     model_path = cache(test_model_spec)
@@ -31,6 +34,7 @@ def test_model():
     assert r
 
 
+@pytest.mark.skip(reason="Video model requires too much GPU memory.")
 def test_client(setup):
     endpoint, _ = setup
     from ....client import Client
@@ -56,4 +60,4 @@ def test_client(setup):
         "The background includes a small, flowing stream and vibrant green foliage, "
        "enhancing the peaceful and magical atmosphere of this unique musical performance." 
) - print(r) + assert r From 6b1a4dd3b620fa53d7ef9c8283785c6f2b72ab5d Mon Sep 17 00:00:00 2001 From: qinxuye Date: Fri, 9 Aug 2024 07:10:56 +0000 Subject: [PATCH 12/15] fix --- setup.cfg | 4 + xinference/client/restful/restful_client.py | 2 +- xinference/deploy/docker/requirements.txt | 1 + xinference/deploy/docker/requirements_cpu.txt | 1 + xinference/model/video/diffusers.py | 87 +++++++++++++++++-- xinference/types.py | 7 +- 6 files changed, 91 insertions(+), 11 deletions(-) diff --git a/setup.cfg b/setup.cfg index 05e33f9b5a..8664c59fdd 100644 --- a/setup.cfg +++ b/setup.cfg @@ -95,6 +95,7 @@ all = sentence-transformers>=2.7.0 vllm>=0.2.6 ; sys_platform=='linux' diffusers>=0.25.0 # fix conflict with matcha-tts + imageio-ffmpeg # For video controlnet_aux orjson auto-gptq ; sys_platform!='darwin' @@ -158,6 +159,9 @@ rerank = image = diffusers>=0.25.0 # fix conflict with matcha-tts controlnet_aux +video = + diffusers + imageio-ffmpeg audio = funasr omegaconf~=2.3.0 diff --git a/xinference/client/restful/restful_client.py b/xinference/client/restful/restful_client.py index 24ba0b7038..c11c30c29f 100644 --- a/xinference/client/restful/restful_client.py +++ b/xinference/client/restful/restful_client.py @@ -402,7 +402,7 @@ def text_to_video( response = requests.post(url, json=request_body, headers=self.auth_headers) if response.status_code != 200: raise RuntimeError( - f"Failed to create the images, detail: {_get_error_string(response)}" + f"Failed to create the video, detail: {_get_error_string(response)}" ) response_data = response.json() diff --git a/xinference/deploy/docker/requirements.txt b/xinference/deploy/docker/requirements.txt index 66f6d650af..1830a7de25 100644 --- a/xinference/deploy/docker/requirements.txt +++ b/xinference/deploy/docker/requirements.txt @@ -60,6 +60,7 @@ onnxruntime==1.16.0; sys_platform == 'darwin' or sys_platform == 'windows' # Fo openai-whisper # For CosyVoice boto3>=1.28.55,<1.28.65 # For tensorizer tensorizer~=2.9.0 +imageio-ffmpeg # For video # sglang outlines>=0.0.44 diff --git a/xinference/deploy/docker/requirements_cpu.txt b/xinference/deploy/docker/requirements_cpu.txt index a117e0c549..7ae0a2544d 100644 --- a/xinference/deploy/docker/requirements_cpu.txt +++ b/xinference/deploy/docker/requirements_cpu.txt @@ -55,3 +55,4 @@ matcha-tts # For CosyVoice onnxruntime-gpu==1.16.0; sys_platform == 'linux' # For CosyVoice onnxruntime==1.16.0; sys_platform == 'darwin' or sys_platform == 'windows' # For CosyVoice openai-whisper # For CosyVoice +imageio-ffmpeg # For video diff --git a/xinference/model/video/diffusers.py b/xinference/model/video/diffusers.py index 930cefa09e..0cd951aa62 100644 --- a/xinference/model/video/diffusers.py +++ b/xinference/model/video/diffusers.py @@ -12,18 +12,25 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import base64
 import logging
 import os
 import sys
+import tempfile
 import time
 import uuid
-from typing import TYPE_CHECKING
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
+from io import BytesIO
+from typing import TYPE_CHECKING, List, Optional, Union
 
+import numpy as np
+import PIL.Image
 import torch
 
 from ...constants import XINFERENCE_VIDEO_DIR
 from ...device_utils import move_model_to_available_device
-from ...types import VideoList
+from ...types import Video, VideoList
 
 if TYPE_CHECKING:
     from .core import VideoModelFamilyV1
@@ -32,6 +39,26 @@
 logger = logging.getLogger(__name__)
 
 
+def export_to_video_imageio(
+    video_frames: Union[List[np.ndarray], List["PIL.Image.Image"]],
+    output_video_path: Optional[str] = None,
+    fps: int = 8,
+) -> str:
+    """
+    Export the video frames to a video file using the imageio library, to avoid the "green screen" issue (for example with CogVideoX).
+    """
+    import imageio
+
+    if output_video_path is None:
+        output_video_path = tempfile.NamedTemporaryFile(suffix=".mp4").name
+    if isinstance(video_frames[0], PIL.Image.Image):
+        video_frames = [np.array(frame) for frame in video_frames]
+    with imageio.get_writer(output_video_path, fps=fps) as writer:
+        for frame in video_frames:
+            writer.append_data(frame)
+    return output_video_path
+
+
 class DiffUsersVideoModel:
     def __init__(
         self,
@@ -66,7 +93,7 @@ def load(self):
             from diffusers import CogVideoXPipeline
 
             self._model = CogVideoXPipeline.from_pretrained(
-                self._model_path, torch_dtype=torch.float16
+                self._model_path, **self._kwargs
             )
         else:
             raise Exception(
@@ -86,32 +113,74 @@ def text_to_video(
         self,
         prompt: str,
         n: int = 1,
+        num_inference_steps: int = 50,
+        guidance_scale: int = 6,
+        response_format: str = "b64_json",
         **kwargs,
     ) -> VideoList:
-        from diffusers.utils import export_to_video
+        import gc
+
+        # a cv2 bug can cause the exported video to display incorrectly,
+        # so we use the imageio exporter instead
+        # from diffusers.utils import export_to_video
+        from ...device_utils import empty_cache
 
         logger.debug(
             "diffusers text_to_video args: %s",
             kwargs,
         )
         assert self._model is not None
+        if self._kwargs.get("cpu_offload"):
+            # if cpu offload is enabled,
+            # model.device would be CPU
+            device = "cuda"
+        else:
+            device = self._model.device
         prompt_embeds, _ = self._model.encode_prompt(
             prompt=prompt,
             do_classifier_free_guidance=True,
             num_videos_per_prompt=n,
             max_sequence_length=226,
-            device=self._model.device,
+            device=device,
             dtype=torch.float16,
         )
         assert callable(self._model)
         output = self._model(
-            num_inference_steps=50,
-            guidance_scale=6,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
             prompt_embeds=prompt_embeds,
+            **kwargs,
         )
+
+        # clean cache
+        gc.collect()
+        empty_cache()
+
+        os.makedirs(XINFERENCE_VIDEO_DIR, exist_ok=True)
         urls = []
         for f in output.frames:
             path = os.path.join(XINFERENCE_VIDEO_DIR, uuid.uuid4().hex + ".mp4")
-            p = export_to_video(f, path, fps=8)
+            p = export_to_video_imageio(f, path, fps=8)
             urls.append(p)
-        return VideoList(created=int(time.time()), data=urls)
+        if response_format == "url":
+            return VideoList(
+                created=int(time.time()),
+                data=[Video(url=url, b64_json=None) for url in urls],
+            )
+        elif response_format == "b64_json":
+
+            def _gen_base64_video(_video_url):
+                try:
+                    with open(_video_url, "rb") as f:
+                        buffered = BytesIO()
+                        buffered.write(f.read())
+                        return base64.b64encode(buffered.getvalue()).decode()
+                finally:
+                    os.remove(_video_url)
+
+            with ThreadPoolExecutor() as executor:
+                results = list(map(partial(executor.submit, 
_gen_base64_video), urls)) # type: ignore + video_list = [Video(url=None, b64_json=s.result()) for s in results] + return VideoList(created=int(time.time()), data=video_list) + else: + raise ValueError(f"Unsupported response format: {response_format}") diff --git a/xinference/types.py b/xinference/types.py index 4e1106ae51..3f636d94c3 100644 --- a/xinference/types.py +++ b/xinference/types.py @@ -52,9 +52,14 @@ class ImageList(TypedDict): data: List[Image] +class Video(TypedDict): + url: Optional[str] + b64_json: Optional[str] + + class VideoList(TypedDict): created: int - data: List[str] + data: List[Video] class EmbeddingUsage(TypedDict): From 2878e85d739d10d3f7ff318f6e2206817567b816 Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Fri, 9 Aug 2024 10:05:16 +0200 Subject: [PATCH 13/15] Fix --- xinference/model/video/diffusers.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/xinference/model/video/diffusers.py b/xinference/model/video/diffusers.py index 0cd951aa62..920572b61f 100644 --- a/xinference/model/video/diffusers.py +++ b/xinference/model/video/diffusers.py @@ -172,9 +172,7 @@ def text_to_video( def _gen_base64_video(_video_url): try: with open(_video_url, "rb") as f: - buffered = BytesIO() - buffered.write(f.read()) - return base64.b64encode(buffered.getvalue()).decode() + return base64.b64encode(f.read()).decode() finally: os.remove(_video_url) From 8bb906034277e150c1ee75de8f256caa9428fed7 Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Fri, 9 Aug 2024 10:10:09 +0200 Subject: [PATCH 14/15] Fix --- xinference/model/video/diffusers.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/xinference/model/video/diffusers.py b/xinference/model/video/diffusers.py index 920572b61f..8786d36c1a 100644 --- a/xinference/model/video/diffusers.py +++ b/xinference/model/video/diffusers.py @@ -41,7 +41,7 @@ def export_to_video_imageio( video_frames: Union[List[np.ndarray], List["PIL.Image.Image"]], - output_video_path: Optional[str] = None, + output_video_path: str, fps: int = 8, ) -> str: """ @@ -49,8 +49,6 @@ def export_to_video_imageio( """ import imageio - if output_video_path is None: - output_video_path = tempfile.NamedTemporaryFile(suffix=".mp4").name if isinstance(video_frames[0], PIL.Image.Image): video_frames = [np.array(frame) for frame in video_frames] with imageio.get_writer(output_video_path, fps=fps) as writer: From cd90792bd8c500c4285f6b401490a980a2db611c Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Fri, 9 Aug 2024 10:11:47 +0200 Subject: [PATCH 15/15] Fix lint --- xinference/model/video/diffusers.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/xinference/model/video/diffusers.py b/xinference/model/video/diffusers.py index 8786d36c1a..b9b8569918 100644 --- a/xinference/model/video/diffusers.py +++ b/xinference/model/video/diffusers.py @@ -16,13 +16,11 @@ import logging import os import sys -import tempfile import time import uuid from concurrent.futures import ThreadPoolExecutor from functools import partial -from io import BytesIO -from typing import TYPE_CHECKING, List, Optional, Union +from typing import TYPE_CHECKING, List, Union import numpy as np import PIL.Image
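
A minimal sketch of end-to-end usage of the video API added by this series, mirroring xinference/model/video/tests/test_diffusers_video.py. It assumes a Xinference server running locally on the default port, an illustrative prompt and model uid, and enough GPU memory for CogVideoX-2b:

    import base64

    from xinference.client import Client

    client = Client("http://127.0.0.1:9997")  # assumed local endpoint
    model_uid = client.launch_model(
        model_uid="my_video_model",
        model_name="CogVideoX-2b",
        model_type="video",
    )
    model = client.get_model(model_uid)
    # text_to_video defaults to response_format="b64_json" (see PATCH 12),
    # so each returned Video carries the clip base64-encoded.
    result = model.text_to_video(prompt="an apple rolling on a wooden table", n=1)
    with open("output.mp4", "wb") as f:
        f.write(base64.b64decode(result["data"][0]["b64_json"]))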