From 7db935a1414e35520b6bb87222e28b2f52d632fb Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Wed, 28 Feb 2024 10:21:28 +0530 Subject: [PATCH 01/28] fix kwarg in the SDXL LoRA DreamBooth (#7124) * fix kwarg * Empty-Commit --- examples/dreambooth/train_dreambooth_lora_sdxl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/dreambooth/train_dreambooth_lora_sdxl.py b/examples/dreambooth/train_dreambooth_lora_sdxl.py index e42311fb8f9e..85effa19b59d 100644 --- a/examples/dreambooth/train_dreambooth_lora_sdxl.py +++ b/examples/dreambooth/train_dreambooth_lora_sdxl.py @@ -1764,7 +1764,7 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers): accelerator, pipeline_args, epoch, - final_validation=True, + is_final_validation=True, ) if args.push_to_hub: From e6cf21906d2df37aade90aa344287353e2dd04d2 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Wed, 28 Feb 2024 22:15:04 +0530 Subject: [PATCH 02/28] [Diffusers CI] Switch slow test runners (#7123) update --- .github/workflows/push_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml index d64e24ed4209..113b89e6abb7 100644 --- a/.github/workflows/push_tests.yml +++ b/.github/workflows/push_tests.yml @@ -64,7 +64,7 @@ jobs: max-parallel: 1 matrix: module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }} - runs-on: docker-gpu + runs-on: [single-gpu, nvidia-gpu, t4, ci] container: image: diffusers/diffusers-pytorch-cuda options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0 From 2cad1a84652d3b279eb88437665f0226faa38bc6 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Wed, 28 Feb 2024 08:04:28 -1000 Subject: [PATCH 03/28] [stalebot] don't close the issue if the stale label is removed (#7106) * fix * fix * remove stalebot's ability to close issues --------- Co-authored-by: yiyixuxu --- utils/stale.py | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/utils/stale.py b/utils/stale.py index 9a29254e6257..30dddd4475d8 100644 --- a/utils/stale.py +++ b/utils/stale.py @@ -39,29 +39,18 @@ def main(): open_issues = repo.get_issues(state="open") for issue in open_issues: - comments = sorted(issue.get_comments(), key=lambda i: i.created_at, reverse=True) - last_comment = comments[0] if len(comments) > 0 else None - if ( - last_comment is not None - and last_comment.user.login == "github-actions[bot]" - and (dt.now(timezone.utc) - issue.updated_at).days > 7 - and (dt.now(timezone.utc) - issue.created_at).days >= 30 - and not any(label.name.lower() in LABELS_TO_EXEMPT for label in issue.get_labels()) - ): - # Closes the issue after 7 days of inactivity since the Stalebot notification. - issue.edit(state="closed") - elif ( - "stale" in issue.get_labels() - and last_comment is not None - and last_comment.user.login != "github-actions[bot]" - ): - # Opens the issue if someone other than Stalebot commented. - issue.edit(state="open") - issue.remove_from_labels("stale") + labels = [label.name for label in issue.get_labels()] + if "stale" in labels: + comments = sorted(issue.get_comments(), key=lambda i: i.created_at, reverse=True) + last_comment = comments[0] if len(comments) > 0 else None + if last_comment is not None and last_comment.user.login != "github-actions[bot]": + # Opens the issue if someone other than Stalebot commented. 
+ issue.edit(state="open") + issue.remove_from_labels("stale") elif ( (dt.now(timezone.utc) - issue.updated_at).days > 23 and (dt.now(timezone.utc) - issue.created_at).days >= 30 - and not any(label.name.lower() in LABELS_TO_EXEMPT for label in issue.get_labels()) + and not any(label.name.lower() in LABELS_TO_EXEMPT for label in labels) ): # Post a Stalebot notification after 23 days of inactivity. issue.create_comment( From fa633ed6dec10bf23f3743088d1740417c9ef3ab Mon Sep 17 00:00:00 2001 From: elucida <1095429904@qq.com> Date: Thu, 29 Feb 2024 03:26:39 +0800 Subject: [PATCH 04/28] refactor: move model helper function in pipeline to a mixin class (#6571) * move model helper function in pipeline to EfficiencyMixin --------- Co-authored-by: YiYi Xu Co-authored-by: Sayak Paul --- ...p_guided_images_mixing_stable_diffusion.py | 14 +- .../community/clip_guided_stable_diffusion.py | 14 +- .../clip_guided_stable_diffusion_img2img.py | 14 +- .../community/composable_stable_diffusion.py | 61 +------ examples/community/gluegen.py | 58 +------ examples/community/imagic_stable_diffusion.py | 28 +-- examples/community/img2img_inpainting.py | 27 --- examples/community/instaflow_one_step.py | 35 +--- .../community/interpolate_stable_diffusion.py | 31 +--- examples/community/ip_adapter_face_id.py | 126 +------------- .../latent_consistency_interpolate.py | 65 +------ examples/community/llm_grounded_diffusion.py | 69 +------- examples/community/lpw_stable_diffusion.py | 111 +----------- examples/community/lpw_stable_diffusion_xl.py | 131 +-------------- examples/community/mixture_canvas.py | 4 +- .../multilingual_stable_diffusion.py | 31 +--- .../pipeline_animatediff_controlnet.py | 67 +------- .../pipeline_animatediff_img2video.py | 69 +------- .../community/pipeline_demofusion_sdxl.py | 39 +---- .../community/pipeline_sdxl_style_aligned.py | 123 +------------- ..._stable_diffusion_xl_controlnet_adapter.py | 69 +------- ...diffusion_xl_controlnet_adapter_inpaint.py | 66 +------- .../pipeline_stable_diffusion_xl_ipex.py | 61 ------- examples/community/pipeline_zero1to3.py | 111 +----------- examples/community/sd_text2img_k_diffusion.py | 68 +------- .../community/seed_resize_stable_diffusion.py | 30 +--- .../community/speech_to_image_diffusion.py | 11 +- .../community/stable_diffusion_comparison.py | 28 +-- .../stable_diffusion_controlnet_img2img.py | 90 +--------- .../stable_diffusion_controlnet_inpaint.py | 90 +--------- ...le_diffusion_controlnet_inpaint_img2img.py | 90 +--------- examples/community/stable_diffusion_ipex.py | 111 +----------- examples/community/stable_diffusion_mega.py | 30 +--- .../community/stable_diffusion_repaint.py | 81 +-------- examples/community/text_inpainting.py | 68 +------- .../community/unclip_image_interpolation.py | 46 +---- .../community/unclip_text_interpolation.py | 47 +----- .../community/wildcard_stable_diffusion.py | 4 +- .../controlnetxs/pipeline_controlnet_xs.py | 65 +------ .../pipeline_controlnet_xs_sd_xl.py | 69 +------- .../research_projects/rdm/pipeline_rdm.py | 122 +------------- src/diffusers/__init__.py | 2 + src/diffusers/models/attention_processor.py | 15 +- .../models/unets/unet_3d_condition.py | 39 +++++ src/diffusers/models/unets/unet_i2vgen_xl.py | 38 +++++ .../models/unets/unet_motion_model.py | 39 +++++ src/diffusers/pipelines/__init__.py | 2 + .../animatediff/pipeline_animatediff.py | 69 +------- .../pipeline_animatediff_video2video.py | 70 +------- .../pipelines/audioldm/pipeline_audioldm.py | 20 +-- .../pipelines/audioldm2/pipeline_audioldm2.py 
| 4 +- .../controlnet/pipeline_controlnet.py | 70 +------- .../controlnet/pipeline_controlnet_img2img.py | 70 +------- .../controlnet/pipeline_controlnet_inpaint.py | 70 +------- .../pipeline_controlnet_inpaint_sd_xl.py | 65 +------ .../controlnet/pipeline_controlnet_sd_xl.py | 64 +------ .../pipeline_controlnet_sd_xl_img2img.py | 69 +------- .../alt_diffusion/pipeline_alt_diffusion.py | 124 +------------- .../pipeline_alt_diffusion_img2img.py | 95 +---------- ...pipeline_stable_diffusion_model_editing.py | 22 +-- .../pipeline_stable_diffusion_paradigms.py | 37 +--- .../pipeline_stable_diffusion_pix2pix_zero.py | 4 +- .../pipelines/i2vgen_xl/pipeline_i2vgen_xl.py | 68 +------- .../pipeline_latent_consistency_img2img.py | 70 +------- .../pipeline_latent_consistency_text2img.py | 70 +------- .../pipelines/musicldm/pipeline_musicldm.py | 20 +-- .../pipeline_paint_by_example.py | 4 +- src/diffusers/pipelines/pia/pipeline_pia.py | 71 +------- src/diffusers/pipelines/pipeline_utils.py | 122 ++++++++++++++ .../pipeline_semantic_stable_diffusion.py | 4 +- .../pipeline_stable_diffusion.py | 126 +------------- ...peline_stable_diffusion_image_variation.py | 32 +--- .../pipeline_stable_diffusion_img2img.py | 132 +-------------- .../pipeline_stable_diffusion_inpaint.py | 132 +-------------- ...eline_stable_diffusion_instruct_pix2pix.py | 32 +--- ...ipeline_stable_diffusion_latent_upscale.py | 32 +--- .../pipeline_stable_diffusion_upscale.py | 32 +--- .../pipeline_stable_unclip.py | 20 +-- .../pipeline_stable_unclip_img2img.py | 22 +-- ...line_stable_diffusion_attend_and_excite.py | 20 +-- .../pipeline_stable_diffusion_diffedit.py | 39 +---- .../pipeline_stable_diffusion_gligen.py | 33 +--- ...line_stable_diffusion_gligen_text_image.py | 33 +--- .../pipeline_stable_diffusion_k_diffusion.py | 6 +- ...ipeline_stable_diffusion_xl_k_diffusion.py | 124 +------------- .../pipeline_stable_diffusion_ldm3d.py | 42 +---- .../pipeline_stable_diffusion_panorama.py | 22 +-- .../pipeline_stable_diffusion_safe.py | 4 +- .../pipeline_stable_diffusion_sag.py | 20 +-- .../pipeline_stable_diffusion_xl.py | 123 +------------- .../pipeline_stable_diffusion_xl_img2img.py | 126 +------------- .../pipeline_stable_diffusion_xl_inpaint.py | 126 +------------- ...ne_stable_diffusion_xl_instruct_pix2pix.py | 68 +------- .../pipeline_stable_diffusion_adapter.py | 48 +----- .../pipeline_stable_diffusion_xl_adapter.py | 64 +------ .../pipeline_text_to_video_synth.py | 65 +------ .../pipeline_text_to_video_synth_img2img.py | 65 +------ .../pipeline_text_to_video_zero.py | 4 +- .../pipeline_text_to_video_zero_sdxl.py | 19 +-- .../unidiffuser/pipeline_unidiffuser.py | 66 ++++---- src/diffusers/utils/dummy_pt_objects.py | 15 ++ .../pipelines/animatediff/test_animatediff.py | 6 +- tests/pipelines/i2vgen_xl/test_i2vgenxl.py | 4 +- .../test_stable_diffusion.py | 13 +- .../test_stable_diffusion_xl.py | 33 +--- tests/pipelines/test_pipelines_common.py | 159 ++++++++++++++++++ .../test_text_to_video.py | 4 +- 107 files changed, 798 insertions(+), 5004 deletions(-) diff --git a/examples/community/clip_guided_images_mixing_stable_diffusion.py b/examples/community/clip_guided_images_mixing_stable_diffusion.py index 399f5b14506d..16dcecd7b22a 100644 --- a/examples/community/clip_guided_images_mixing_stable_diffusion.py +++ b/examples/community/clip_guided_images_mixing_stable_diffusion.py @@ -12,12 +12,12 @@ from diffusers import ( AutoencoderKL, DDIMScheduler, - DiffusionPipeline, DPMSolverMultistepScheduler, LMSDiscreteScheduler, PNDMScheduler, 
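# For orientation while reading the deletions that follow (not part of the
# patch): every removed helper is re-homed on one mixin in
# src/diffusers/pipelines/pipeline_utils.py (+122 lines in the diffstat above).
# A condensed sketch, reconstructed from the method bodies deleted below; the
# real class also consolidates the FreeU and fused-QKV helpers:
class StableDiffusionMixin:
    """Shared helpers for pipelines that expose `self.vae` and `self.unet`."""

    def enable_vae_slicing(self):
        # Decode latents slice by slice: lower peak memory, slightly slower.
        self.vae.enable_slicing()

    def disable_vae_slicing(self):
        self.vae.disable_slicing()

    def enable_vae_tiling(self):
        # Decode/encode in tiles so images larger than memory allows still fit.
        self.vae.enable_tiling()

    def disable_vae_tiling(self):
        self.vae.disable_tiling()

    def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
        if not hasattr(self, "unet"):
            raise ValueError("The pipeline must have `unet` for using FreeU.")
        self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)

    def disable_freeu(self):
        self.unet.disable_freeu()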
UNet2DConditionModel, ) +from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput from diffusers.utils import PIL_INTERPOLATION from diffusers.utils.torch_utils import randn_tensor @@ -77,7 +77,7 @@ def set_requires_grad(model, value): param.requires_grad = value -class CLIPGuidedImagesMixingStableDiffusion(DiffusionPipeline): +class CLIPGuidedImagesMixingStableDiffusion(DiffusionPipeline, StableDiffusionMixin): def __init__( self, vae: AutoencoderKL, @@ -113,16 +113,6 @@ def __init__( set_requires_grad(self.text_encoder, False) set_requires_grad(self.clip_model, False) - def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): - if slice_size == "auto": - # half the attention head size is usually a good trade-off between - # speed and memory - slice_size = self.unet.config.attention_head_dim // 2 - self.unet.set_attention_slice(slice_size) - - def disable_attention_slicing(self): - self.enable_attention_slicing(None) - def freeze_vae(self): set_requires_grad(self.vae, False) diff --git a/examples/community/clip_guided_stable_diffusion.py b/examples/community/clip_guided_stable_diffusion.py index 3f4ab2ab9f4a..4205718802de 100644 --- a/examples/community/clip_guided_stable_diffusion.py +++ b/examples/community/clip_guided_stable_diffusion.py @@ -10,12 +10,12 @@ from diffusers import ( AutoencoderKL, DDIMScheduler, - DiffusionPipeline, DPMSolverMultistepScheduler, LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel, ) +from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput @@ -51,7 +51,7 @@ def set_requires_grad(model, value): param.requires_grad = value -class CLIPGuidedStableDiffusion(DiffusionPipeline): +class CLIPGuidedStableDiffusion(DiffusionPipeline, StableDiffusionMixin): """CLIP guided stable diffusion based on the amazing repo by @crowsonkb and @Jack000 - https://github.com/Jack000/glid-3-xl - https://github.dev/crowsonkb/k-diffusion @@ -89,16 +89,6 @@ def __init__( set_requires_grad(self.text_encoder, False) set_requires_grad(self.clip_model, False) - def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): - if slice_size == "auto": - # half the attention head size is usually a good trade-off between - # speed and memory - slice_size = self.unet.config.attention_head_dim // 2 - self.unet.set_attention_slice(slice_size) - - def disable_attention_slicing(self): - self.enable_attention_slicing(None) - def freeze_vae(self): set_requires_grad(self.vae, False) diff --git a/examples/community/clip_guided_stable_diffusion_img2img.py b/examples/community/clip_guided_stable_diffusion_img2img.py index 2dbc9bef9ffe..434d5253679a 100644 --- a/examples/community/clip_guided_stable_diffusion_img2img.py +++ b/examples/community/clip_guided_stable_diffusion_img2img.py @@ -12,12 +12,12 @@ from diffusers import ( AutoencoderKL, DDIMScheduler, - DiffusionPipeline, DPMSolverMultistepScheduler, LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel, ) +from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput from diffusers.utils import PIL_INTERPOLATION, deprecate from diffusers.utils.torch_utils import randn_tensor @@ -125,7 +125,7 @@ def 
set_requires_grad(model, value): param.requires_grad = value -class CLIPGuidedStableDiffusion(DiffusionPipeline): +class CLIPGuidedStableDiffusion(DiffusionPipeline, StableDiffusionMixin): """CLIP guided stable diffusion based on the amazing repo by @crowsonkb and @Jack000 - https://github.com/Jack000/glid-3-xl - https://github.dev/crowsonkb/k-diffusion @@ -163,16 +163,6 @@ def __init__( set_requires_grad(self.text_encoder, False) set_requires_grad(self.clip_model, False) - def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): - if slice_size == "auto": - # half the attention head size is usually a good trade-off between - # speed and memory - slice_size = self.unet.config.attention_head_dim // 2 - self.unet.set_attention_slice(slice_size) - - def disable_attention_slicing(self): - self.enable_attention_slicing(None) - def freeze_vae(self): set_requires_grad(self.vae, False) diff --git a/examples/community/composable_stable_diffusion.py b/examples/community/composable_stable_diffusion.py index 2693ae45afac..3153bd30e479 100644 --- a/examples/community/composable_stable_diffusion.py +++ b/examples/community/composable_stable_diffusion.py @@ -22,6 +22,7 @@ from diffusers import DiffusionPipeline from diffusers.configuration_utils import FrozenDict from diffusers.models import AutoencoderKL, UNet2DConditionModel +from diffusers.pipelines.pipeline_utils import StableDiffusionMixin from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker from diffusers.schedulers import ( @@ -32,13 +33,13 @@ LMSDiscreteScheduler, PNDMScheduler, ) -from diffusers.utils import deprecate, is_accelerate_available, logging +from diffusers.utils import deprecate, logging logger = logging.get_logger(__name__) # pylint: disable=invalid-name -class ComposableStableDiffusionPipeline(DiffusionPipeline): +class ComposableStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin): r""" Pipeline for text-to-image generation using Stable Diffusion. @@ -164,62 +165,6 @@ def __init__( self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. - - When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several - steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - def enable_sequential_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, - text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a - `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. 
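# Illustration, not part of the diff: the hand-rolled offload and
# device-resolution code deleted from this pipeline is covered by the generic
# implementations on the DiffusionPipeline base class, so the caller-facing
# API is unchanged. A minimal sketch (checkpoint id illustrative):
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
pipe.enable_sequential_cpu_offload()  # submodule-level offload, lowest memory
image = pipe("a photograph of an astronaut").images[0]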
- """ - if is_accelerate_available(): - from accelerate import cpu_offload - else: - raise ImportError("Please install accelerate via `pip install accelerate`") - - device = torch.device(f"cuda:{gpu_id}") - - for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]: - if cpu_offloaded_model is not None: - cpu_offload(cpu_offloaded_model, device) - - if self.safety_checker is not None: - # TODO(Patrick) - there is currently a bug with cpu offload of nn.Parameter in accelerate - # fix by only offloading self.safety_checker for now - cpu_offload(self.safety_checker.vision_model, device) - - @property - def _execution_device(self): - r""" - Returns the device on which the pipeline's models will be executed. After calling - `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module - hooks. - """ - if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"): - return self.device - for module in self.unet.modules(): - if ( - hasattr(module, "_hf_hook") - and hasattr(module._hf_hook, "execution_device") - and module._hf_hook.execution_device is not None - ): - return torch.device(module._hf_hook.execution_device) - return self.device - def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt): r""" Encodes the prompt into text encoder hidden states. diff --git a/examples/community/gluegen.py b/examples/community/gluegen.py index ecfe91eb9483..b8f147000229 100644 --- a/examples/community/gluegen.py +++ b/examples/community/gluegen.py @@ -10,6 +10,7 @@ from diffusers.loaders import LoraLoaderMixin from diffusers.models import AutoencoderKL, UNet2DConditionModel from diffusers.models.lora import adjust_lora_scale_text_encoder +from diffusers.pipelines.pipeline_utils import StableDiffusionMixin from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker from diffusers.schedulers import KarrasDiffusionSchedulers @@ -193,7 +194,7 @@ def retrieve_timesteps( return timesteps, num_inference_steps -class GlueGenStableDiffusionPipeline(DiffusionPipeline, LoraLoaderMixin): +class GlueGenStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin, LoraLoaderMixin): def __init__( self, vae: AutoencoderKL, @@ -241,35 +242,6 @@ def load_language_adapter( ) self.language_adapter.load_state_dict(torch.load(model_path)) - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. 
- """ - self.vae.disable_tiling() - def _adapt_language(self, prompt_embeds: torch.FloatTensor): prompt_embeds = prompt_embeds / 3 prompt_embeds = self.language_adapter(prompt_embeds) * (self.tensor_norm / 2) @@ -544,32 +516,6 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype latents = latents * self.scheduler.init_noise_sigma return latents - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. - """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): """ diff --git a/examples/community/imagic_stable_diffusion.py b/examples/community/imagic_stable_diffusion.py index 98e06ceb21ac..25048e946fe0 100644 --- a/examples/community/imagic_stable_diffusion.py +++ b/examples/community/imagic_stable_diffusion.py @@ -19,6 +19,7 @@ from diffusers import DiffusionPipeline from diffusers.models import AutoencoderKL, UNet2DConditionModel +from diffusers.pipelines.pipeline_utils import StableDiffusionMixin from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler @@ -56,7 +57,7 @@ def preprocess(image): return 2.0 * image - 1.0 -class ImagicStableDiffusionPipeline(DiffusionPipeline): +class ImagicStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin): r""" Pipeline for imagic image editing. See paper here: https://arxiv.org/pdf/2210.09276.pdf @@ -105,31 +106,6 @@ def __init__( feature_extractor=feature_extractor, ) - def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): - r""" - Enable sliced attention computation. - When this option is enabled, the attention module will split the input tensor in slices, to compute attention - in several steps. This is useful to save some memory in exchange for a small speed decrease. - Args: - slice_size (`str` or `int`, *optional*, defaults to `"auto"`): - When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. 
If - a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case, - `attention_head_dim` must be a multiple of `slice_size`. - """ - if slice_size == "auto": - # half the attention head size is usually a good trade-off between - # speed and memory - slice_size = self.unet.config.attention_head_dim // 2 - self.unet.set_attention_slice(slice_size) - - def disable_attention_slicing(self): - r""" - Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go - back to computing attention in one step. - """ - # set slice_size = `None` to disable `attention slicing` - self.enable_attention_slicing(None) - def train( self, prompt: Union[str, List[str]], diff --git a/examples/community/img2img_inpainting.py b/examples/community/img2img_inpainting.py index 8ee8355d49a6..71dc3cf712ed 100644 --- a/examples/community/img2img_inpainting.py +++ b/examples/community/img2img_inpainting.py @@ -129,33 +129,6 @@ def __init__( feature_extractor=feature_extractor, ) - def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): - r""" - Enable sliced attention computation. - - When this option is enabled, the attention module will split the input tensor in slices, to compute attention - in several steps. This is useful to save some memory in exchange for a small speed decrease. - - Args: - slice_size (`str` or `int`, *optional*, defaults to `"auto"`): - When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If - a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case, - `attention_head_dim` must be a multiple of `slice_size`. - """ - if slice_size == "auto": - # half the attention head size is usually a good trade-off between - # speed and memory - slice_size = self.unet.config.attention_head_dim // 2 - self.unet.set_attention_slice(slice_size) - - def disable_attention_slicing(self): - r""" - Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go - back to computing attention in one step. 
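# Illustration, not part of the diff: these per-pipeline attention-slicing
# copies are redundant because DiffusionPipeline ships the same entry points
# with the same "auto" half-head default. Sketch (checkpoint id illustrative):
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
pipe.enable_attention_slicing()   # slice_size="auto": half the attention heads
pipe.enable_attention_slicing(4)  # or an explicit number of slices
pipe.disable_attention_slicing()  # back to computing attention in one step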
- """ - # set slice_size = `None` to disable `attention slicing` - self.enable_attention_slicing(None) - @torch.no_grad() def __call__( self, diff --git a/examples/community/instaflow_one_step.py b/examples/community/instaflow_one_step.py index 065abfe13d23..b07d85f8fcdf 100644 --- a/examples/community/instaflow_one_step.py +++ b/examples/community/instaflow_one_step.py @@ -24,7 +24,7 @@ from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from diffusers.models import AutoencoderKL, UNet2DConditionModel from diffusers.models.lora import adjust_lora_scale_text_encoder -from diffusers.pipelines.pipeline_utils import DiffusionPipeline +from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker from diffusers.schedulers import KarrasDiffusionSchedulers @@ -52,7 +52,9 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): return noise_cfg -class InstaFlowPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin): +class InstaFlowPipeline( + DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin +): r""" Pipeline for text-to-image generation using Rectified Flow and Euler discretization. This customized pipeline is based on StableDiffusionPipeline from the official Diffusers library (0.21.4) @@ -180,35 +182,6 @@ def __init__( self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.register_to_config(requires_safety_checker=requires_safety_checker) - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. 
- """ - self.vae.disable_tiling() - def _encode_prompt( self, prompt, diff --git a/examples/community/interpolate_stable_diffusion.py b/examples/community/interpolate_stable_diffusion.py index 70e4d025a037..1b859c35f174 100644 --- a/examples/community/interpolate_stable_diffusion.py +++ b/examples/community/interpolate_stable_diffusion.py @@ -7,9 +7,9 @@ import torch from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer -from diffusers import DiffusionPipeline from diffusers.configuration_utils import FrozenDict from diffusers.models import AutoencoderKL, UNet2DConditionModel +from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler @@ -46,7 +46,7 @@ def slerp(t, v0, v1, DOT_THRESHOLD=0.9995): return v2 -class StableDiffusionWalkPipeline(DiffusionPipeline): +class StableDiffusionWalkPipeline(DiffusionPipeline, StableDiffusionMixin): r""" Pipeline for text-to-image generation using Stable Diffusion. @@ -120,33 +120,6 @@ def __init__( feature_extractor=feature_extractor, ) - def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): - r""" - Enable sliced attention computation. - - When this option is enabled, the attention module will split the input tensor in slices, to compute attention - in several steps. This is useful to save some memory in exchange for a small speed decrease. - - Args: - slice_size (`str` or `int`, *optional*, defaults to `"auto"`): - When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If - a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case, - `attention_head_dim` must be a multiple of `slice_size`. - """ - if slice_size == "auto": - # half the attention head size is usually a good trade-off between - # speed and memory - slice_size = self.unet.config.attention_head_dim // 2 - self.unet.set_attention_slice(slice_size) - - def disable_attention_slicing(self): - r""" - Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go - back to computing attention in one step. 
- """ - # set slice_size = `None` to disable `attention slicing` - self.enable_attention_slicing(None) - @torch.no_grad() def __call__( self, diff --git a/examples/community/ip_adapter_face_id.py b/examples/community/ip_adapter_face_id.py index dfd6a9df6eb1..b4d2446b5ce9 100644 --- a/examples/community/ip_adapter_face_id.py +++ b/examples/community/ip_adapter_face_id.py @@ -26,9 +26,8 @@ from diffusers.image_processor import VaeImageProcessor from diffusers.loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin from diffusers.models import AutoencoderKL, UNet2DConditionModel -from diffusers.models.attention_processor import FusedAttnProcessor2_0 from diffusers.models.lora import LoRALinearLayer, adjust_lora_scale_text_encoder -from diffusers.pipelines.pipeline_utils import DiffusionPipeline +from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker from diffusers.schedulers import KarrasDiffusionSchedulers @@ -415,7 +414,12 @@ def retrieve_timesteps( class IPAdapterFaceIDStableDiffusionPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + LoraLoaderMixin, + IPAdapterMixin, + FromSingleFileMixin, ): r""" Pipeline for text-to-image generation using Stable Diffusion. @@ -727,35 +731,6 @@ def set_ip_adapter_scale(self, scale): if isinstance(attn_processor, (LoRAIPAdapterAttnProcessor, LoRAIPAdapterAttnProcessor2_0)): attn_processor.scale = scale - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_tiling() - def _encode_prompt( self, prompt, @@ -1080,93 +1055,6 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype latents = latents * self.scheduler.init_noise_sigma return latents - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. 
- - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. - """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.fuse_qkv_projections - def fuse_qkv_projections(self, unet: bool = True, vae: bool = True): - """ - Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, - key, value) are fused. For cross-attention modules, key and value projection matrices are fused. - - - - This API is 🧪 experimental. - - - - Args: - unet (`bool`, defaults to `True`): To apply fusion on the UNet. - vae (`bool`, defaults to `True`): To apply fusion on the VAE. - """ - self.fusing_unet = False - self.fusing_vae = False - - if unet: - self.fusing_unet = True - self.unet.fuse_qkv_projections() - self.unet.set_attn_processor(FusedAttnProcessor2_0()) - - if vae: - if not isinstance(self.vae, AutoencoderKL): - raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.") - - self.fusing_vae = True - self.vae.fuse_qkv_projections() - self.vae.set_attn_processor(FusedAttnProcessor2_0()) - - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.unfuse_qkv_projections - def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True): - """Disable QKV projection fusion if enabled. - - - - This API is 🧪 experimental. - - - - Args: - unet (`bool`, defaults to `True`): To apply fusion on the UNet. - vae (`bool`, defaults to `True`): To apply fusion on the VAE. - - """ - if unet: - if not self.fusing_unet: - logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.") - else: - self.unet.unfuse_qkv_projections() - self.fusing_unet = False - - if vae: - if not self.fusing_vae: - logger.warning("The VAE was not initially fused for QKV projections. 
Doing nothing.") - else: - self.vae.unfuse_qkv_projections() - self.fusing_vae = False - # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): """ diff --git a/examples/community/latent_consistency_interpolate.py b/examples/community/latent_consistency_interpolate.py index 7b9e4806bf44..0c14a55bd30f 100644 --- a/examples/community/latent_consistency_interpolate.py +++ b/examples/community/latent_consistency_interpolate.py @@ -9,7 +9,7 @@ from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from diffusers.models import AutoencoderKL, UNet2DConditionModel from diffusers.models.lora import adjust_lora_scale_text_encoder -from diffusers.pipelines.pipeline_utils import DiffusionPipeline +from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker from diffusers.schedulers import LCMScheduler from diffusers.utils import ( @@ -190,7 +190,7 @@ def slerp( class LatentConsistencyModelWalkPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin + DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin ): r""" Pipeline for text-to-image generation using a latent consistency model. @@ -273,67 +273,6 @@ def __init__( self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.register_to_config(requires_safety_checker=requires_safety_checker) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. 
- - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. - """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt def encode_prompt( self, diff --git a/examples/community/llm_grounded_diffusion.py b/examples/community/llm_grounded_diffusion.py index d815b4ea8e42..5db144a9a23a 100644 --- a/examples/community/llm_grounded_diffusion.py +++ b/examples/community/llm_grounded_diffusion.py @@ -35,6 +35,7 @@ from diffusers.models.attention_processor import AttnProcessor2_0 from diffusers.models.lora import adjust_lora_scale_text_encoder from diffusers.pipelines import DiffusionPipeline +from diffusers.pipelines.pipeline_utils import StableDiffusionMixin from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker from diffusers.schedulers import KarrasDiffusionSchedulers @@ -267,7 +268,12 @@ def __call__( class LLMGroundedDiffusionPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + LoraLoaderMixin, + IPAdapterMixin, + FromSingleFileMixin, ): r""" Pipeline for layout-grounded text-to-image generation using LLM-grounded Diffusion (LMD+): https://arxiv.org/pdf/2305.13655.pdf. @@ -1180,39 +1186,6 @@ def latent_lmd_guidance( # Below are methods copied from StableDiffusionPipeline # The design choice of not inheriting from StableDiffusionPipeline is discussed here: https://github.com/huggingface/diffusers/pull/5993#issuecomment-1834258517 - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. 
If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_tiling() - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, @@ -1522,34 +1495,6 @@ def prepare_latents( latents = latents * self.scheduler.init_noise_sigma return latents - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. 
- """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): """ diff --git a/examples/community/lpw_stable_diffusion.py b/examples/community/lpw_stable_diffusion.py index 7249e033186f..78d93bfb7081 100644 --- a/examples/community/lpw_stable_diffusion.py +++ b/examples/community/lpw_stable_diffusion.py @@ -13,13 +13,12 @@ from diffusers.image_processor import VaeImageProcessor from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from diffusers.models import AutoencoderKL, UNet2DConditionModel +from diffusers.pipelines.pipeline_utils import StableDiffusionMixin from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker from diffusers.schedulers import KarrasDiffusionSchedulers from diffusers.utils import ( PIL_INTERPOLATION, deprecate, - is_accelerate_available, - is_accelerate_version, logging, ) from diffusers.utils.torch_utils import randn_tensor @@ -410,7 +409,7 @@ def preprocess_mask(mask, batch_size, scale_factor=8): class StableDiffusionLongPromptWeightingPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin + DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin ): r""" Pipeline for text-to-image generation using Stable Diffusion without tokens length limit, and support parsing @@ -534,112 +533,6 @@ def __init__( requires_safety_checker=requires_safety_checker, ) - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. - - When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several - steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. - - When this option is enabled, the VAE will split the input tensor into tiles to compute decoding and encoding in - several steps. This is useful to save a large amount of memory and to allow the processing of larger images. - """ - self.vae.enable_tiling() - - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously invoked, this method will go back to - computing decoding in one step. - """ - self.vae.disable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload - def enable_sequential_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, significantly reducing memory usage. 
When called, unet, - text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a - `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. - Note that offloading happens on a submodule basis. Memory savings are higher than with - `enable_model_cpu_offload`, but performance is lower. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"): - from accelerate import cpu_offload - else: - raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]: - cpu_offload(cpu_offloaded_model, device) - - if self.safety_checker is not None: - cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - if self.safety_checker is not None: - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - - # We'll offload the last model manually. - self.final_offload_hook = hook - - @property - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device - def _execution_device(self): - r""" - Returns the device on which the pipeline's models will be executed. After calling - `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module - hooks. 
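# Illustration, not part of the diff: both offload flavors survive on the base
# class, with the same trade-off the removed docstrings describe. Sketch, pick
# one (checkpoint id illustrative):
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
pipe.enable_model_cpu_offload()  # whole-model hops: faster, smaller savings
# pipe.enable_sequential_cpu_offload()  # per-submodule: slowest, most savings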
- """ - if not hasattr(self.unet, "_hf_hook"): - return self.device - for module in self.unet.modules(): - if ( - hasattr(module, "_hf_hook") - and hasattr(module._hf_hook, "execution_device") - and module._hf_hook.execution_device is not None - ): - return torch.device(module._hf_hook.execution_device) - return self.device - def _encode_prompt( self, prompt, diff --git a/examples/community/lpw_stable_diffusion_xl.py b/examples/community/lpw_stable_diffusion_xl.py index 4b63fa7019a7..b0abbba9a32b 100644 --- a/examples/community/lpw_stable_diffusion_xl.py +++ b/examples/community/lpw_stable_diffusion_xl.py @@ -26,11 +26,11 @@ from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel from diffusers.models.attention_processor import ( AttnProcessor2_0, - FusedAttnProcessor2_0, LoRAAttnProcessor2_0, LoRAXFormersAttnProcessor, XFormersAttnProcessor, ) +from diffusers.pipelines.pipeline_utils import StableDiffusionMixin from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput from diffusers.schedulers import KarrasDiffusionSchedulers from diffusers.utils import ( @@ -545,7 +545,12 @@ def retrieve_timesteps( class SDXLLongPromptWeightingPipeline( - DiffusionPipeline, FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin + DiffusionPipeline, + StableDiffusionMixin, + FromSingleFileMixin, + IPAdapterMixin, + LoraLoaderMixin, + TextualInversionLoaderMixin, ): r""" Pipeline for text-to-image generation using Stable Diffusion XL. @@ -649,39 +654,6 @@ def __init__( else: self.watermark = None - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_tiling() - def enable_model_cpu_offload(self, gpu_id=0): r""" Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared @@ -1030,95 +1002,6 @@ def check_inputs( "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. 
Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`." ) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. - """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.fuse_qkv_projections - def fuse_qkv_projections(self, unet: bool = True, vae: bool = True): - """ - Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, - key, value) are fused. For cross-attention modules, key and value projection matrices are fused. - - - - This API is 🧪 experimental. - - - - Args: - unet (`bool`, defaults to `True`): To apply fusion on the UNet. - vae (`bool`, defaults to `True`): To apply fusion on the VAE. - """ - self.fusing_unet = False - self.fusing_vae = False - - if unet: - self.fusing_unet = True - self.unet.fuse_qkv_projections() - self.unet.set_attn_processor(FusedAttnProcessor2_0()) - - if vae: - if not isinstance(self.vae, AutoencoderKL): - raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.") - - self.fusing_vae = True - self.vae.fuse_qkv_projections() - self.vae.set_attn_processor(FusedAttnProcessor2_0()) - - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.unfuse_qkv_projections - def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True): - """Disable QKV projection fusion if enabled. - - - - This API is 🧪 experimental. - - - - Args: - unet (`bool`, defaults to `True`): To apply fusion on the UNet. - vae (`bool`, defaults to `True`): To apply fusion on the VAE. - - """ - if unet: - if not self.fusing_unet: - logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.") - else: - self.unet.unfuse_qkv_projections() - self.fusing_unet = False - - if vae: - if not self.fusing_vae: - logger.warning("The VAE was not initially fused for QKV projections. 
Doing nothing.") - else: - self.vae.unfuse_qkv_projections() - self.fusing_vae = False - def get_timesteps(self, num_inference_steps, strength, device, denoising_start=None): # get the original timestep using init_timestep if denoising_start is None: diff --git a/examples/community/mixture_canvas.py b/examples/community/mixture_canvas.py index 3737183e5513..2083c7acad38 100644 --- a/examples/community/mixture_canvas.py +++ b/examples/community/mixture_canvas.py @@ -12,7 +12,7 @@ from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer from diffusers.models import AutoencoderKL, UNet2DConditionModel -from diffusers.pipelines.pipeline_utils import DiffusionPipeline +from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler @@ -264,7 +264,7 @@ def _quartic_weights(self, region: DiffusionRegion) -> torch.tensor: return torch.tile(torch.tensor(weights), (self.nbatch, self.latent_space_dim, 1, 1)) -class StableDiffusionCanvasPipeline(DiffusionPipeline): +class StableDiffusionCanvasPipeline(DiffusionPipeline, StableDiffusionMixin): """Stable Diffusion pipeline that mixes several diffusers in the same canvas""" def __init__( diff --git a/examples/community/multilingual_stable_diffusion.py b/examples/community/multilingual_stable_diffusion.py index 7597efd215af..f3b0540cf4d3 100644 --- a/examples/community/multilingual_stable_diffusion.py +++ b/examples/community/multilingual_stable_diffusion.py @@ -11,9 +11,9 @@ pipeline, ) -from diffusers import DiffusionPipeline from diffusers.configuration_utils import FrozenDict from diffusers.models import AutoencoderKL, UNet2DConditionModel +from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler @@ -48,7 +48,7 @@ def translate_prompt(prompt, translation_tokenizer, translation_model, device): return en_trans[0] -class MultilingualStableDiffusion(DiffusionPipeline): +class MultilingualStableDiffusion(DiffusionPipeline, StableDiffusionMixin): r""" Pipeline for text-to-image generation using Stable Diffusion in different languages. @@ -135,33 +135,6 @@ def __init__( feature_extractor=feature_extractor, ) - def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): - r""" - Enable sliced attention computation. - - When this option is enabled, the attention module will split the input tensor in slices, to compute attention - in several steps. This is useful to save some memory in exchange for a small speed decrease. - - Args: - slice_size (`str` or `int`, *optional*, defaults to `"auto"`): - When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If - a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case, - `attention_head_dim` must be a multiple of `slice_size`. - """ - if slice_size == "auto": - # half the attention head size is usually a good trade-off between - # speed and memory - slice_size = self.unet.config.attention_head_dim // 2 - self.unet.set_attention_slice(slice_size) - - def disable_attention_slicing(self): - r""" - Disable sliced attention computation. 
If `enable_attention_slicing` was previously invoked, this method will go - back to computing attention in one step. - """ - # set slice_size = `None` to disable `attention slicing` - self.enable_attention_slicing(None) - @torch.no_grad() def __call__( self, diff --git a/examples/community/pipeline_animatediff_controlnet.py b/examples/community/pipeline_animatediff_controlnet.py index 5873ceaa8d70..44fc1bae6f7f 100644 --- a/examples/community/pipeline_animatediff_controlnet.py +++ b/examples/community/pipeline_animatediff_controlnet.py @@ -28,7 +28,7 @@ from diffusers.models.lora import adjust_lora_scale_text_encoder from diffusers.models.unets.unet_motion_model import MotionAdapter from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel -from diffusers.pipelines.pipeline_utils import DiffusionPipeline +from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin from diffusers.schedulers import ( DDIMScheduler, DPMSolverMultistepScheduler, @@ -111,7 +111,9 @@ class AnimateDiffControlNetPipelineOutput(BaseOutput): frames: Union[torch.Tensor, np.ndarray] -class AnimateDiffControlNetPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin): +class AnimateDiffControlNetPipeline( + DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin +): r""" Pipeline for text-to-video generation. @@ -441,67 +443,6 @@ def decode_latents(self, latents): video = video.float() return video - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. 
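Every one of these per-pipeline copies collapses into one shared base. A condensed sketch of what such a StableDiffusionMixin provides, reconstructed from the deleted bodies rather than quoted from the verbatim pipeline_utils source:

class StableDiffusionMixin:
    """Shared helpers for pipelines that hold a `vae` and a `unet` (sketch)."""

    def enable_vae_slicing(self):
        # decode batched latents one slice at a time to save memory
        self.vae.enable_slicing()

    def disable_vae_slicing(self):
        self.vae.disable_slicing()

    def enable_vae_tiling(self):
        # decode/encode large images tile by tile
        self.vae.enable_tiling()

    def disable_vae_tiling(self):
        self.vae.disable_tiling()

    def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
        if not hasattr(self, "unet"):
            raise ValueError("The pipeline must have `unet` for using FreeU.")
        self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)

    def disable_freeu(self):
        self.unet.disable_freeu()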
- - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. - """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs def prepare_extra_step_kwargs(self, generator, eta): # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature diff --git a/examples/community/pipeline_animatediff_img2video.py b/examples/community/pipeline_animatediff_img2video.py index e77e26592d3e..982bed08140b 100644 --- a/examples/community/pipeline_animatediff_img2video.py +++ b/examples/community/pipeline_animatediff_img2video.py @@ -30,9 +30,9 @@ from diffusers.loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel from diffusers.models.lora import adjust_lora_scale_text_encoder -from diffusers.models.unets.unet_motion_model import MotionAdapter +from diffusers.models.unet_motion_model import MotionAdapter from diffusers.pipelines.animatediff.pipeline_output import AnimateDiffPipelineOutput -from diffusers.pipelines.pipeline_utils import DiffusionPipeline +from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin from diffusers.schedulers import ( DDIMScheduler, DPMSolverMultistepScheduler, @@ -232,7 +232,9 @@ def retrieve_timesteps( return timesteps, num_inference_steps -class AnimateDiffImgToVideoPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin): +class AnimateDiffImgToVideoPipeline( + DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin +): r""" Pipeline for image-to-video generation. @@ -564,67 +566,6 @@ def decode_latents(self, latents): video = video.float() return video - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. 
If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. 
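As a usage sketch for the FreeU API documented here, assuming a Stable Diffusion v1.5 checkpoint; the scaling factors are commonly cited starting points for that model, not tuned recommendations:

import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# amplify backbone features (b1/b2), attenuate skip features (s1/s2)
pipe.enable_freeu(s1=0.9, s2=0.2, b1=1.5, b2=1.6)
image = pipe("an astronaut riding a horse").images[0]
pipe.disable_freeu()  # restore the stock UNet behavior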
- """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs def prepare_extra_step_kwargs(self, generator, eta): # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature diff --git a/examples/community/pipeline_demofusion_sdxl.py b/examples/community/pipeline_demofusion_sdxl.py index ab0d3cf9dd29..e29678b55922 100644 --- a/examples/community/pipeline_demofusion_sdxl.py +++ b/examples/community/pipeline_demofusion_sdxl.py @@ -23,7 +23,7 @@ XFormersAttnProcessor, ) from diffusers.models.lora import adjust_lora_scale_text_encoder -from diffusers.pipelines.pipeline_utils import DiffusionPipeline +from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin from diffusers.schedulers import KarrasDiffusionSchedulers from diffusers.utils import ( is_accelerate_available, @@ -93,7 +93,9 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): return noise_cfg -class DemoFusionSDXLPipeline(DiffusionPipeline, FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin): +class DemoFusionSDXLPipeline( + DiffusionPipeline, StableDiffusionMixin, FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin +): r""" Pipeline for text-to-image generation using Stable Diffusion XL. @@ -176,39 +178,6 @@ def __init__( else: self.watermark = None - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. 
- """ - self.vae.disable_tiling() - def encode_prompt( self, prompt: str, diff --git a/examples/community/pipeline_sdxl_style_aligned.py b/examples/community/pipeline_sdxl_style_aligned.py index a9509346dae6..ec4aa3791557 100644 --- a/examples/community/pipeline_sdxl_style_aligned.py +++ b/examples/community/pipeline_sdxl_style_aligned.py @@ -51,7 +51,7 @@ XFormersAttnProcessor, ) from diffusers.models.lora import adjust_lora_scale_text_encoder -from diffusers.pipelines.pipeline_utils import DiffusionPipeline +from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput from diffusers.schedulers import KarrasDiffusionSchedulers from diffusers.utils import ( @@ -389,6 +389,7 @@ def retrieve_latents( class StyleAlignedSDXLPipeline( DiffusionPipeline, + StableDiffusionMixin, FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin, @@ -504,39 +505,6 @@ def __init__( else: self.watermark = None - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_tiling() - def encode_prompt( self, prompt: str, @@ -1187,34 +1155,6 @@ def upcast_vae(self): self.vae.decoder.conv_in.to(dtype) self.vae.decoder.mid_block.to(dtype) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. 
- s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. - """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - def _enable_shared_attention_processors( self, share_attention: bool, @@ -1361,65 +1301,6 @@ def disable_style_aligned(self): self._style_aligned_norm_layers = None self._disable_shared_attention_processors() - def fuse_qkv_projections(self, unet: bool = True, vae: bool = True): - """ - Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, - key, value) are fused. For cross-attention modules, key and value projection matrices are fused. - - - - This API is 🧪 experimental. - - - - Args: - unet (`bool`, defaults to `True`): To apply fusion on the UNet. - vae (`bool`, defaults to `True`): To apply fusion on the VAE. - """ - self.fusing_unet = False - self.fusing_vae = False - - if unet: - self.fusing_unet = True - self.unet.fuse_qkv_projections() - self.unet.set_attn_processor(FusedAttnProcessor2_0()) - - if vae: - if not isinstance(self.vae, AutoencoderKL): - raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.") - - self.fusing_vae = True - self.vae.fuse_qkv_projections() - self.vae.set_attn_processor(FusedAttnProcessor2_0()) - - def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True): - """Disable QKV projection fusion if enabled. - - - - This API is 🧪 experimental. - - - - Args: - unet (`bool`, defaults to `True`): To apply fusion on the UNet. - vae (`bool`, defaults to `True`): To apply fusion on the VAE. - - """ - if unet: - if not self.fusing_unet: - logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.") - else: - self.unet.unfuse_qkv_projections() - self.fusing_unet = False - - if vae: - if not self.fusing_vae: - logger.warning("The VAE was not initially fused for QKV projections. 
Doing nothing.") - else: - self.vae.unfuse_qkv_projections() - self.fusing_vae = False - # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): """ diff --git a/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter.py b/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter.py index e1437bee7a15..fe94646a4436 100644 --- a/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter.py +++ b/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter.py @@ -33,7 +33,7 @@ ) from diffusers.models.lora import adjust_lora_scale_text_encoder from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel -from diffusers.pipelines.pipeline_utils import DiffusionPipeline +from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput from diffusers.schedulers import KarrasDiffusionSchedulers from diffusers.utils import ( @@ -158,7 +158,11 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): class StableDiffusionXLControlNetAdapterPipeline( - DiffusionPipeline, FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin + DiffusionPipeline, + StableDiffusionMixin, + FromSingleFileMixin, + StableDiffusionXLLoraLoaderMixin, + TextualInversionLoaderMixin, ): r""" Pipeline for text-to-image generation using Stable Diffusion augmented with T2I-Adapter @@ -234,39 +238,6 @@ def __init__( ) self.default_sample_size = self.unet.config.sample_size - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. 
- """ - self.vae.disable_tiling() - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt def encode_prompt( self, @@ -863,34 +834,6 @@ def _default_height_width(self, height, width, image): return height, width - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. - """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - def prepare_control_image( self, image, diff --git a/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py b/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py index c7967bbc12a2..ac645e0802d2 100644 --- a/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py +++ b/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py @@ -52,6 +52,7 @@ ) from diffusers.models.lora import adjust_lora_scale_text_encoder from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel +from diffusers.pipelines.pipeline_utils import StableDiffusionMixin from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput from diffusers.schedulers import KarrasDiffusionSchedulers from diffusers.utils import ( @@ -303,7 +304,9 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): return noise_cfg -class StableDiffusionXLControlNetAdapterInpaintPipeline(DiffusionPipeline, FromSingleFileMixin, LoraLoaderMixin): +class StableDiffusionXLControlNetAdapterInpaintPipeline( + DiffusionPipeline, StableDiffusionMixin, FromSingleFileMixin, LoraLoaderMixin +): r""" Pipeline for text-to-image generation using Stable Diffusion augmented with T2I-Adapter https://arxiv.org/abs/2302.08453 @@ -383,39 +386,6 @@ def __init__( ) self.default_sample_size = self.unet.config.sample_size - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. 
This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_tiling() - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt def encode_prompt( self, @@ -1207,34 +1177,6 @@ def _default_height_width(self, height, width, image): return height, width - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. 
- """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - def prepare_control_image( self, image, diff --git a/examples/community/pipeline_stable_diffusion_xl_ipex.py b/examples/community/pipeline_stable_diffusion_xl_ipex.py index c57d58bb58ba..68ad5dbec77d 100644 --- a/examples/community/pipeline_stable_diffusion_xl_ipex.py +++ b/examples/community/pipeline_stable_diffusion_xl_ipex.py @@ -267,39 +267,6 @@ def __init__( else: self.watermark = None - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_tiling() - def encode_prompt( self, prompt: str, @@ -701,34 +668,6 @@ def upcast_vae(self): self.vae.decoder.conv_in.to(dtype) self.vae.decoder.mid_block.to(dtype) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. 
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. - """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): """ diff --git a/examples/community/pipeline_zero1to3.py b/examples/community/pipeline_zero1to3.py index 600cf2dc1b63..133aa694c18c 100644 --- a/examples/community/pipeline_zero1to3.py +++ b/examples/community/pipeline_zero1to3.py @@ -22,18 +22,16 @@ # randn_tensor, # replace_example_docstring, # ) -# from ..pipeline_utils import DiffusionPipeline +# from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin # from . import StableDiffusionPipelineOutput # from .safety_checker import StableDiffusionSafetyChecker -from diffusers import AutoencoderKL, DiffusionPipeline, UNet2DConditionModel +from diffusers import AutoencoderKL, DiffusionPipeline, StableDiffusionMixin, UNet2DConditionModel from diffusers.configuration_utils import ConfigMixin, FrozenDict from diffusers.models.modeling_utils import ModelMixin from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker from diffusers.schedulers import KarrasDiffusionSchedulers from diffusers.utils import ( deprecate, - is_accelerate_available, - is_accelerate_version, logging, replace_example_docstring, ) @@ -68,7 +66,7 @@ def forward(self, x): return self.projection(x) -class Zero1to3StableDiffusionPipeline(DiffusionPipeline): +class Zero1to3StableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin): r""" Pipeline for single view conditioned novel view generation using Zero1to3. @@ -187,109 +185,6 @@ def __init__( self.register_to_config(requires_safety_checker=requires_safety_checker) # self.model_mode = None - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. - - When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several - steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. - - When this option is enabled, the VAE will split the input tensor into tiles to compute decoding and encoding in - several steps. This is useful to save a large amount of memory and to allow the processing of larger images. - """ - self.vae.enable_tiling() - - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously invoked, this method will go back to - computing decoding in one step. - """ - self.vae.disable_tiling() - - def enable_sequential_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, significantly reducing memory usage. 
When called, unet, - text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a - `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. - Note that offloading happens on a submodule basis. Memory savings are higher than with - `enable_model_cpu_offload`, but performance is lower. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"): - from accelerate import cpu_offload - else: - raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]: - cpu_offload(cpu_offloaded_model, device) - - if self.safety_checker is not None: - cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) - - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - if self.safety_checker is not None: - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - - # We'll offload the last model manually. - self.final_offload_hook = hook - - @property - def _execution_device(self): - r""" - Returns the device on which the pipeline's models will be executed. After calling - `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module - hooks. 
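Both offload helpers survive on the shared base classes. A sketch contrasting the two strategies; the checkpoint name is illustrative, and only one toggle should be active at a time:

import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
)

# whole-model offload: near-GPU speed, moderate memory savings
pipe.enable_model_cpu_offload()
# submodule-level offload: maximum savings, slowest
# pipe.enable_sequential_cpu_offload()

image = pipe("a photo of a dog").images[0]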
- """ - if not hasattr(self.unet, "_hf_hook"): - return self.device - for module in self.unet.modules(): - if ( - hasattr(module, "_hf_hook") - and hasattr(module._hf_hook, "execution_device") - and module._hf_hook.execution_device is not None - ): - return torch.device(module._hf_hook.execution_device) - return self.device - def _encode_prompt( self, prompt, diff --git a/examples/community/sd_text2img_k_diffusion.py b/examples/community/sd_text2img_k_diffusion.py index c6a4bf2ce613..3299a7605257 100755 --- a/examples/community/sd_text2img_k_diffusion.py +++ b/examples/community/sd_text2img_k_diffusion.py @@ -19,9 +19,9 @@ import torch from k_diffusion.external import CompVisDenoiser, CompVisVDenoiser -from diffusers import DiffusionPipeline, LMSDiscreteScheduler +from diffusers import DiffusionPipeline, LMSDiscreteScheduler, StableDiffusionMixin from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput -from diffusers.utils import is_accelerate_available, logging +from diffusers.utils import logging logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -41,7 +41,7 @@ def apply_model(self, *args, **kwargs): return self.model(*args, encoder_hidden_states=encoder_hidden_states, **kwargs).sample -class StableDiffusionPipeline(DiffusionPipeline): +class StableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin): r""" Pipeline for text-to-image generation using Stable Diffusion. @@ -120,68 +120,6 @@ def set_scheduler(self, scheduler_type: str): sampling = getattr(library, "sampling") self.sampler = getattr(sampling, scheduler_type) - def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): - r""" - Enable sliced attention computation. - - When this option is enabled, the attention module will split the input tensor in slices, to compute attention - in several steps. This is useful to save some memory in exchange for a small speed decrease. - - Args: - slice_size (`str` or `int`, *optional*, defaults to `"auto"`): - When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If - a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case, - `attention_head_dim` must be a multiple of `slice_size`. - """ - if slice_size == "auto": - # half the attention head size is usually a good trade-off between - # speed and memory - slice_size = self.unet.config.attention_head_dim // 2 - self.unet.set_attention_slice(slice_size) - - def disable_attention_slicing(self): - r""" - Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go - back to computing attention in one step. - """ - # set slice_size = `None` to disable `attention slicing` - self.enable_attention_slicing(None) - - def enable_sequential_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, - text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a - `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. 
- """ - if is_accelerate_available(): - from accelerate import cpu_offload - else: - raise ImportError("Please install accelerate via `pip install accelerate`") - - device = torch.device(f"cuda:{gpu_id}") - - for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]: - if cpu_offloaded_model is not None: - cpu_offload(cpu_offloaded_model, device) - - @property - def _execution_device(self): - r""" - Returns the device on which the pipeline's models will be executed. After calling - `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module - hooks. - """ - if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"): - return self.device - for module in self.unet.modules(): - if ( - hasattr(module, "_hf_hook") - and hasattr(module._hf_hook, "execution_device") - and module._hf_hook.execution_device is not None - ): - return torch.device(module._hf_hook.execution_device) - return self.device - def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt): r""" Encodes the prompt into text encoder hidden states. diff --git a/examples/community/seed_resize_stable_diffusion.py b/examples/community/seed_resize_stable_diffusion.py index 9318277b8f01..20f972f049b3 100644 --- a/examples/community/seed_resize_stable_diffusion.py +++ b/examples/community/seed_resize_stable_diffusion.py @@ -9,6 +9,7 @@ from diffusers import DiffusionPipeline from diffusers.models import AutoencoderKL, UNet2DConditionModel +from diffusers.pipelines.pipeline_utils import StableDiffusionMixin from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler @@ -18,7 +19,7 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name -class SeedResizeStableDiffusionPipeline(DiffusionPipeline): +class SeedResizeStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin): r""" Pipeline for text-to-image generation using Stable Diffusion. @@ -67,33 +68,6 @@ def __init__( feature_extractor=feature_extractor, ) - def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): - r""" - Enable sliced attention computation. - - When this option is enabled, the attention module will split the input tensor in slices, to compute attention - in several steps. This is useful to save some memory in exchange for a small speed decrease. - - Args: - slice_size (`str` or `int`, *optional*, defaults to `"auto"`): - When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If - a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case, - `attention_head_dim` must be a multiple of `slice_size`. - """ - if slice_size == "auto": - # half the attention head size is usually a good trade-off between - # speed and memory - slice_size = self.unet.config.attention_head_dim // 2 - self.unet.set_attention_slice(slice_size) - - def disable_attention_slicing(self): - r""" - Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go - back to computing attention in one step. 
- """ - # set slice_size = `None` to disable `attention slicing` - self.enable_attention_slicing(None) - @torch.no_grad() def __call__( self, diff --git a/examples/community/speech_to_image_diffusion.py b/examples/community/speech_to_image_diffusion.py index 63bcfb662517..3537ef89e1a1 100644 --- a/examples/community/speech_to_image_diffusion.py +++ b/examples/community/speech_to_image_diffusion.py @@ -18,6 +18,7 @@ PNDMScheduler, UNet2DConditionModel, ) +from diffusers.pipelines.pipeline_utils import StableDiffusionMixin from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker from diffusers.utils import logging @@ -26,7 +27,7 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name -class SpeechToImagePipeline(DiffusionPipeline): +class SpeechToImagePipeline(DiffusionPipeline, StableDiffusionMixin): def __init__( self, speech_model: WhisperForConditionalGeneration, @@ -62,14 +63,6 @@ def __init__( feature_extractor=feature_extractor, ) - def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): - if slice_size == "auto": - slice_size = self.unet.config.attention_head_dim // 2 - self.unet.set_attention_slice(slice_size) - - def disable_attention_slicing(self): - self.enable_attention_slicing(None) - @torch.no_grad() def __call__( self, diff --git a/examples/community/stable_diffusion_comparison.py b/examples/community/stable_diffusion_comparison.py index 7997a0cc0186..dab5705b3370 100644 --- a/examples/community/stable_diffusion_comparison.py +++ b/examples/community/stable_diffusion_comparison.py @@ -12,6 +12,7 @@ StableDiffusionPipeline, UNet2DConditionModel, ) +from diffusers.pipelines.pipeline_utils import StableDiffusionMixin from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker @@ -22,7 +23,7 @@ pipe4_model_id = "CompVis/stable-diffusion-v1-4" -class StableDiffusionComparisonPipeline(DiffusionPipeline): +class StableDiffusionComparisonPipeline(DiffusionPipeline, StableDiffusionMixin): r""" Pipeline for parallel comparison of Stable Diffusion v1-v4 This pipeline inherits from DiffusionPipeline and depends on the use of an Auth Token for @@ -83,31 +84,6 @@ def __init__( def layers(self) -> Dict[str, Any]: return {k: getattr(self, k) for k in self.config.keys() if not k.startswith("_")} - def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): - r""" - Enable sliced attention computation. - When this option is enabled, the attention module will split the input tensor in slices, to compute attention - in several steps. This is useful to save some memory in exchange for a small speed decrease. - Args: - slice_size (`str` or `int`, *optional*, defaults to `"auto"`): - When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If - a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case, - `attention_head_dim` must be a multiple of `slice_size`. - """ - if slice_size == "auto": - # half the attention head size is usually a good trade-off between - # speed and memory - slice_size = self.unet.config.attention_head_dim // 2 - self.unet.set_attention_slice(slice_size) - - def disable_attention_slicing(self): - r""" - Disable sliced attention computation. 
If `enable_attention_slicing` was previously invoked, this method will go - back to computing attention in one step. - """ - # set slice_size = `None` to disable `attention slicing` - self.enable_attention_slicing(None) - @torch.no_grad() def text2img_sd1_1( self, diff --git a/examples/community/stable_diffusion_controlnet_img2img.py b/examples/community/stable_diffusion_controlnet_img2img.py index a2b92fff0fb5..5f9083616a84 100644 --- a/examples/community/stable_diffusion_controlnet_img2img.py +++ b/examples/community/stable_diffusion_controlnet_img2img.py @@ -8,14 +8,13 @@ import torch from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer -from diffusers import AutoencoderKL, ControlNetModel, DiffusionPipeline, UNet2DConditionModel, logging +from diffusers import AutoencoderKL, ControlNetModel, UNet2DConditionModel, logging from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel +from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker from diffusers.schedulers import KarrasDiffusionSchedulers from diffusers.utils import ( PIL_INTERPOLATION, - is_accelerate_available, - is_accelerate_version, replace_example_docstring, ) from diffusers.utils.torch_utils import randn_tensor @@ -130,7 +129,7 @@ def prepare_controlnet_conditioning_image( return controlnet_conditioning_image -class StableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline): +class StableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline, StableDiffusionMixin): """ Inspired by: https://github.com/haofanwang/ControlNet-for-Diffusers/ """ @@ -183,89 +182,6 @@ def __init__( self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. - - When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several - steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - def enable_sequential_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, - text_encoder, vae, controlnet, and safety checker have their state dicts saved to CPU and then are moved to a - `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. - Note that offloading happens on a submodule basis. Memory savings are higher than with - `enable_model_cpu_offload`, but performance is lower. - """ - if is_accelerate_available(): - from accelerate import cpu_offload - else: - raise ImportError("Please install accelerate via `pip install accelerate`") - - device = torch.device(f"cuda:{gpu_id}") - - for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.controlnet]: - cpu_offload(cpu_offloaded_model, device) - - if self.safety_checker is not None: - cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) - - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. 
Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - hook = None - for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - if self.safety_checker is not None: - # the safety checker can offload the vae again - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - - # control net hook has be manually offloaded as it alternates with unet - cpu_offload_with_hook(self.controlnet, device) - - # We'll offload the last model manually. - self.final_offload_hook = hook - - @property - def _execution_device(self): - r""" - Returns the device on which the pipeline's models will be executed. After calling - `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module - hooks. - """ - if not hasattr(self.unet, "_hf_hook"): - return self.device - for module in self.unet.modules(): - if ( - hasattr(module, "_hf_hook") - and hasattr(module._hf_hook, "execution_device") - and module._hf_hook.execution_device is not None - ): - return torch.device(module._hf_hook.execution_device) - return self.device - def _encode_prompt( self, prompt, diff --git a/examples/community/stable_diffusion_controlnet_inpaint.py b/examples/community/stable_diffusion_controlnet_inpaint.py index b87973366418..0173ed41bee6 100644 --- a/examples/community/stable_diffusion_controlnet_inpaint.py +++ b/examples/community/stable_diffusion_controlnet_inpaint.py @@ -9,14 +9,13 @@ import torch.nn.functional as F from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer -from diffusers import AutoencoderKL, ControlNetModel, DiffusionPipeline, UNet2DConditionModel, logging +from diffusers import AutoencoderKL, ControlNetModel, UNet2DConditionModel, logging from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel +from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker from diffusers.schedulers import KarrasDiffusionSchedulers from diffusers.utils import ( PIL_INTERPOLATION, - is_accelerate_available, - is_accelerate_version, replace_example_docstring, ) from diffusers.utils.torch_utils import randn_tensor @@ -228,7 +227,7 @@ def prepare_controlnet_conditioning_image( return controlnet_conditioning_image -class StableDiffusionControlNetInpaintPipeline(DiffusionPipeline): +class StableDiffusionControlNetInpaintPipeline(DiffusionPipeline, StableDiffusionMixin): """ Inspired by: https://github.com/haofanwang/ControlNet-for-Diffusers/ """ @@ -282,89 +281,6 @@ def __init__( self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. 
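For the ControlNet variants the offload order matters because the controlnet alternates with the unet. A sketch with the mainline pipeline; the blank conditioning image stands in for a real Canny edge map:

import torch
from PIL import Image
from diffusers import ControlNetModel, StableDiffusionControlNetPipeline

controlnet = ControlNetModel.from_pretrained(
    "lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16
)
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload()  # handles the unet/controlnet alternation

canny_image = Image.new("RGB", (512, 512))  # stand-in for a precomputed edge map
image = pipe("a bird", image=canny_image).images[0]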
- - When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several - steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - def enable_sequential_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, - text_encoder, vae, controlnet, and safety checker have their state dicts saved to CPU and then are moved to a - `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. - Note that offloading happens on a submodule basis. Memory savings are higher than with - `enable_model_cpu_offload`, but performance is lower. - """ - if is_accelerate_available(): - from accelerate import cpu_offload - else: - raise ImportError("Please install accelerate via `pip install accelerate`") - - device = torch.device(f"cuda:{gpu_id}") - - for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.controlnet]: - cpu_offload(cpu_offloaded_model, device) - - if self.safety_checker is not None: - cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) - - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - hook = None - for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - if self.safety_checker is not None: - # the safety checker can offload the vae again - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - - # control net hook has be manually offloaded as it alternates with unet - cpu_offload_with_hook(self.controlnet, device) - - # We'll offload the last model manually. - self.final_offload_hook = hook - - @property - def _execution_device(self): - r""" - Returns the device on which the pipeline's models will be executed. After calling - `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module - hooks. 
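# Editor's sketch (illustrative, not from this patch): after enabling offload,
# the pipeline object may report a CPU/meta device while accelerate's hooks
# know the real compute device; the `_execution_device` property above is what
# resolves it. Checkpoint id and output are assumptions.
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
)
pipe.enable_sequential_cpu_offload()
print(pipe.device)             # typically cpu/meta-backed once offloaded
print(pipe._execution_device)  # e.g. device(type='cuda', index=0), via the hooks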
- """ - if not hasattr(self.unet, "_hf_hook"): - return self.device - for module in self.unet.modules(): - if ( - hasattr(module, "_hf_hook") - and hasattr(module._hf_hook, "execution_device") - and module._hf_hook.execution_device is not None - ): - return torch.device(module._hf_hook.execution_device) - return self.device - def _encode_prompt( self, prompt, diff --git a/examples/community/stable_diffusion_controlnet_inpaint_img2img.py b/examples/community/stable_diffusion_controlnet_inpaint_img2img.py index 96ad3c39239d..d056eb112165 100644 --- a/examples/community/stable_diffusion_controlnet_inpaint_img2img.py +++ b/examples/community/stable_diffusion_controlnet_inpaint_img2img.py @@ -9,13 +9,12 @@ import torch.nn.functional as F from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer -from diffusers import AutoencoderKL, ControlNetModel, DiffusionPipeline, UNet2DConditionModel, logging +from diffusers import AutoencoderKL, ControlNetModel, UNet2DConditionModel, logging +from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker from diffusers.schedulers import KarrasDiffusionSchedulers from diffusers.utils import ( PIL_INTERPOLATION, - is_accelerate_available, - is_accelerate_version, replace_example_docstring, ) from diffusers.utils.torch_utils import randn_tensor @@ -217,7 +216,7 @@ def prepare_controlnet_conditioning_image( return controlnet_conditioning_image -class StableDiffusionControlNetInpaintImg2ImgPipeline(DiffusionPipeline): +class StableDiffusionControlNetInpaintImg2ImgPipeline(DiffusionPipeline, StableDiffusionMixin): """ Inspired by: https://github.com/haofanwang/ControlNet-for-Diffusers/ """ @@ -267,89 +266,6 @@ def __init__( self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. - - When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several - steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - def enable_sequential_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, - text_encoder, vae, controlnet, and safety checker have their state dicts saved to CPU and then are moved to a - `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. - Note that offloading happens on a submodule basis. Memory savings are higher than with - `enable_model_cpu_offload`, but performance is lower. 
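# Editor's sketch (illustrative, not from this patch): the two offload modes
# removed here trade memory for speed in opposite directions.
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
)
pipe.enable_sequential_cpu_offload()  # max memory savings, slowest: per-submodule hops
# pipe.enable_model_cpu_offload()     # alternative: whole models move at once, faster
image = pipe("an astronaut riding a horse", num_inference_steps=25).images[0]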
- """ - if is_accelerate_available(): - from accelerate import cpu_offload - else: - raise ImportError("Please install accelerate via `pip install accelerate`") - - device = torch.device(f"cuda:{gpu_id}") - - for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.controlnet]: - cpu_offload(cpu_offloaded_model, device) - - if self.safety_checker is not None: - cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) - - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - hook = None - for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - if self.safety_checker is not None: - # the safety checker can offload the vae again - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - - # control net hook has be manually offloaded as it alternates with unet - cpu_offload_with_hook(self.controlnet, device) - - # We'll offload the last model manually. - self.final_offload_hook = hook - - @property - def _execution_device(self): - r""" - Returns the device on which the pipeline's models will be executed. After calling - `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module - hooks. 
- """ - if not hasattr(self.unet, "_hf_hook"): - return self.device - for module in self.unet.modules(): - if ( - hasattr(module, "_hf_hook") - and hasattr(module._hf_hook, "execution_device") - and module._hf_hook.execution_device is not None - ): - return torch.device(module._hf_hook.execution_device) - return self.device - def _encode_prompt( self, prompt, diff --git a/examples/community/stable_diffusion_ipex.py b/examples/community/stable_diffusion_ipex.py index bf58cc8453a1..8e71f79e9ae4 100644 --- a/examples/community/stable_diffusion_ipex.py +++ b/examples/community/stable_diffusion_ipex.py @@ -23,14 +23,12 @@ from diffusers.configuration_utils import FrozenDict from diffusers.loaders import LoraLoaderMixin, TextualInversionLoaderMixin from diffusers.models import AutoencoderKL, UNet2DConditionModel -from diffusers.pipelines.pipeline_utils import DiffusionPipeline +from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker from diffusers.schedulers import KarrasDiffusionSchedulers from diffusers.utils import ( deprecate, - is_accelerate_available, - is_accelerate_version, logging, replace_example_docstring, ) @@ -62,7 +60,9 @@ """ -class StableDiffusionIPEXPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): +class StableDiffusionIPEXPipeline( + DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin +): r""" Pipeline for text-to-image generation using Stable Diffusion on IPEX. @@ -304,109 +304,6 @@ def prepare_for_ipex(self, promt, dtype=torch.float32, height=None, width=None, ave_decoder_trace_model = torch.jit.freeze(ave_decoder_trace_model) self.vae.decoder.forward = ave_decoder_trace_model.forward - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. - - When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several - steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. - - When this option is enabled, the VAE will split the input tensor into tiles to compute decoding and encoding in - several steps. This is useful to save a large amount of memory and to allow the processing of larger images. - """ - self.vae.enable_tiling() - - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously invoked, this method will go back to - computing decoding in one step. - """ - self.vae.disable_tiling() - - def enable_sequential_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, - text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a - `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. - Note that offloading happens on a submodule basis. Memory savings are higher than with - `enable_model_cpu_offload`, but performance is lower. 
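# Editor's sketch (illustrative, not from this patch): hedged usage of the
# IPEX community pipeline whose helpers are folded into the mixin here; exact
# kwargs may differ across diffusers versions, and the prompt is passed
# positionally (the source parameter is spelled `promt`).
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", custom_pipeline="stable_diffusion_ipex"
)
# Trace/freeze the UNet and VAE decoder for IPEX before the first call.
pipe.prepare_for_ipex("a cabin in the woods", dtype=torch.bfloat16, height=512, width=512)
image = pipe("a cabin in the woods", height=512, width=512, num_inference_steps=20).images[0]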
- """ - if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"): - from accelerate import cpu_offload - else: - raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]: - cpu_offload(cpu_offloaded_model, device) - - if self.safety_checker is not None: - cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) - - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - if self.safety_checker is not None: - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - - # We'll offload the last model manually. - self.final_offload_hook = hook - - @property - def _execution_device(self): - r""" - Returns the device on which the pipeline's models will be executed. After calling - `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module - hooks. - """ - if not hasattr(self.unet, "_hf_hook"): - return self.device - for module in self.unet.modules(): - if ( - hasattr(module, "_hf_hook") - and hasattr(module._hf_hook, "execution_device") - and module._hf_hook.execution_device is not None - ): - return torch.device(module._hf_hook.execution_device) - return self.device - def _encode_prompt( self, prompt, diff --git a/examples/community/stable_diffusion_mega.py b/examples/community/stable_diffusion_mega.py index faed00b49d40..e53afb703e24 100644 --- a/examples/community/stable_diffusion_mega.py +++ b/examples/community/stable_diffusion_mega.py @@ -16,6 +16,7 @@ UNet2DConditionModel, ) from diffusers.configuration_utils import FrozenDict +from diffusers.pipelines.pipeline_utils import StableDiffusionMixin from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker from diffusers.utils import deprecate, logging @@ -23,7 +24,7 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name -class StableDiffusionMegaPipeline(DiffusionPipeline): +class StableDiffusionMegaPipeline(DiffusionPipeline, StableDiffusionMixin): r""" Pipeline for text-to-image generation using Stable Diffusion. 
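# Editor's sketch (illustrative, not from this patch): the point of this
# refactor — a pipeline that inherits StableDiffusionMixin picks up the shared
# helpers (VAE slicing/tiling, FreeU, fused QKV projections) instead of
# re-implementing them. The class below is hypothetical.
from diffusers import DiffusionPipeline, StableDiffusionMixin

class MyPipeline(DiffusionPipeline, StableDiffusionMixin):
    # register vae / text_encoder / unet / scheduler in __init__ as usual;
    # enable_vae_slicing(), enable_freeu(), fuse_qkv_projections(), ... come free.
    pass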
@@ -94,33 +95,6 @@ def __init__( def components(self) -> Dict[str, Any]: return {k: getattr(self, k) for k in self.config.keys() if not k.startswith("_")} - def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): - r""" - Enable sliced attention computation. - - When this option is enabled, the attention module will split the input tensor in slices, to compute attention - in several steps. This is useful to save some memory in exchange for a small speed decrease. - - Args: - slice_size (`str` or `int`, *optional*, defaults to `"auto"`): - When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If - a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case, - `attention_head_dim` must be a multiple of `slice_size`. - """ - if slice_size == "auto": - # half the attention head size is usually a good trade-off between - # speed and memory - slice_size = self.unet.config.attention_head_dim // 2 - self.unet.set_attention_slice(slice_size) - - def disable_attention_slicing(self): - r""" - Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go - back to computing attention in one step. - """ - # set slice_size = `None` to disable `attention slicing` - self.enable_attention_slicing(None) - @torch.no_grad() def inpaint( self, diff --git a/examples/community/stable_diffusion_repaint.py b/examples/community/stable_diffusion_repaint.py index db2de0897570..02bef293bba8 100644 --- a/examples/community/stable_diffusion_repaint.py +++ b/examples/community/stable_diffusion_repaint.py @@ -24,14 +24,13 @@ from diffusers import AutoencoderKL, DiffusionPipeline, UNet2DConditionModel from diffusers.configuration_utils import FrozenDict, deprecate from diffusers.loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from diffusers.pipelines.pipeline_utils import StableDiffusionMixin from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput from diffusers.pipelines.stable_diffusion.safety_checker import ( StableDiffusionSafetyChecker, ) from diffusers.schedulers import KarrasDiffusionSchedulers from diffusers.utils import ( - is_accelerate_available, - is_accelerate_version, logging, ) from diffusers.utils.torch_utils import randn_tensor @@ -140,7 +139,9 @@ def prepare_mask_and_masked_image(image, mask): return mask, masked_image -class StableDiffusionRepaintPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): +class StableDiffusionRepaintPipeline( + DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin +): r""" Pipeline for text-guided image inpainting using Stable Diffusion. *This is an experimental feature*. This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the @@ -276,80 +277,6 @@ def __init__( self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload - def enable_sequential_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, significantly reducing memory usage. 
When called, unet, - text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a - `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. - Note that offloading happens on a submodule basis. Memory savings are higher than with - `enable_model_cpu_offload`, but performance is lower. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"): - from accelerate import cpu_offload - else: - raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]: - cpu_offload(cpu_offloaded_model, device) - - if self.safety_checker is not None: - cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - if self.safety_checker is not None: - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - - # We'll offload the last model manually. - self.final_offload_hook = hook - - @property - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device - def _execution_device(self): - r""" - Returns the device on which the pipeline's models will be executed. After calling - `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module - hooks. 
- """ - if not hasattr(self.unet, "_hf_hook"): - return self.device - for module in self.unet.modules(): - if ( - hasattr(module, "_hf_hook") - and hasattr(module._hf_hook, "execution_device") - and module._hf_hook.execution_device is not None - ): - return torch.device(module._hf_hook.execution_device) - return self.device - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, diff --git a/examples/community/text_inpainting.py b/examples/community/text_inpainting.py index cd02049a4afb..ea4da966bb71 100644 --- a/examples/community/text_inpainting.py +++ b/examples/community/text_inpainting.py @@ -13,16 +13,17 @@ from diffusers import DiffusionPipeline from diffusers.configuration_utils import FrozenDict from diffusers.models import AutoencoderKL, UNet2DConditionModel +from diffusers.pipelines.pipeline_utils import StableDiffusionMixin from diffusers.pipelines.stable_diffusion import StableDiffusionInpaintPipeline from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler -from diffusers.utils import deprecate, is_accelerate_available, logging +from diffusers.utils import deprecate, logging logger = logging.get_logger(__name__) # pylint: disable=invalid-name -class TextInpainting(DiffusionPipeline): +class TextInpainting(DiffusionPipeline, StableDiffusionMixin): r""" Pipeline for text based inpainting using Stable Diffusion. Uses CLIPSeg to get a mask from the given text, then calls the Inpainting pipeline with the generated mask @@ -120,69 +121,6 @@ def __init__( feature_extractor=feature_extractor, ) - def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): - r""" - Enable sliced attention computation. - - When this option is enabled, the attention module will split the input tensor in slices, to compute attention - in several steps. This is useful to save some memory in exchange for a small speed decrease. - - Args: - slice_size (`str` or `int`, *optional*, defaults to `"auto"`): - When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If - a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case, - `attention_head_dim` must be a multiple of `slice_size`. - """ - if slice_size == "auto": - # half the attention head size is usually a good trade-off between - # speed and memory - slice_size = self.unet.config.attention_head_dim // 2 - self.unet.set_attention_slice(slice_size) - - def disable_attention_slicing(self): - r""" - Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go - back to computing attention in one step. - """ - # set slice_size = `None` to disable `attention slicing` - self.enable_attention_slicing(None) - - def enable_sequential_cpu_offload(self): - r""" - Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, - text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a - `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. 
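# Editor's sketch (illustrative, not from this patch): the attention-slicing
# toggle removed in this hunk; "auto" halves the attention heads per step.
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")
pipe.enable_attention_slicing()              # slice_size="auto"
pipe.enable_attention_slicing(slice_size=2)  # or an explicit divisor of the head dim
pipe.disable_attention_slicing()             # back to single-step attention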
- """ - if is_accelerate_available(): - from accelerate import cpu_offload - else: - raise ImportError("Please install accelerate via `pip install accelerate`") - - device = torch.device("cuda") - - for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]: - if cpu_offloaded_model is not None: - cpu_offload(cpu_offloaded_model, device) - - @property - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device - def _execution_device(self): - r""" - Returns the device on which the pipeline's models will be executed. After calling - `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module - hooks. - """ - if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"): - return self.device - for module in self.unet.modules(): - if ( - hasattr(module, "_hf_hook") - and hasattr(module._hf_hook, "execution_device") - and module._hf_hook.execution_device is not None - ): - return torch.device(module._hf_hook.execution_device) - return self.device - @torch.no_grad() def __call__( self, diff --git a/examples/community/unclip_image_interpolation.py b/examples/community/unclip_image_interpolation.py index 95548b152c07..e3bb44e5030b 100644 --- a/examples/community/unclip_image_interpolation.py +++ b/examples/community/unclip_image_interpolation.py @@ -19,7 +19,7 @@ UNet2DModel, ) from diffusers.pipelines.unclip import UnCLIPTextProjModel -from diffusers.utils import is_accelerate_available, logging +from diffusers.utils import logging from diffusers.utils.torch_utils import randn_tensor @@ -204,50 +204,6 @@ def _encode_image(self, image, device, num_images_per_prompt, image_embeddings: return image_embeddings - # Copied from diffusers.pipelines.unclip.pipeline_unclip_image_variation.UnCLIPImageVariationPipeline.enable_sequential_cpu_offload - def enable_sequential_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's - models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only - when their specific submodule has its `forward` method called. - """ - if is_accelerate_available(): - from accelerate import cpu_offload - else: - raise ImportError("Please install accelerate via `pip install accelerate`") - - device = torch.device(f"cuda:{gpu_id}") - - models = [ - self.decoder, - self.text_proj, - self.text_encoder, - self.super_res_first, - self.super_res_last, - ] - for cpu_offloaded_model in models: - if cpu_offloaded_model is not None: - cpu_offload(cpu_offloaded_model, device) - - @property - # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline._execution_device - def _execution_device(self): - r""" - Returns the device on which the pipeline's models will be executed. After calling - `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module - hooks. 
- """ - if self.device != torch.device("meta") or not hasattr(self.decoder, "_hf_hook"): - return self.device - for module in self.decoder.modules(): - if ( - hasattr(module, "_hf_hook") - and hasattr(module._hf_hook, "execution_device") - and module._hf_hook.execution_device is not None - ): - return torch.device(module._hf_hook.execution_device) - return self.device - @torch.no_grad() def __call__( self, diff --git a/examples/community/unclip_text_interpolation.py b/examples/community/unclip_text_interpolation.py index 764299433b4c..be6a0858b35e 100644 --- a/examples/community/unclip_text_interpolation.py +++ b/examples/community/unclip_text_interpolation.py @@ -15,7 +15,7 @@ UNet2DModel, ) from diffusers.pipelines.unclip import UnCLIPTextProjModel -from diffusers.utils import is_accelerate_available, logging +from diffusers.utils import logging from diffusers.utils.torch_utils import randn_tensor @@ -212,51 +212,6 @@ def _encode_prompt( return prompt_embeds, text_encoder_hidden_states, text_mask - # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.enable_sequential_cpu_offload - def enable_sequential_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's - models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only - when their specific submodule has its `forward` method called. - """ - if is_accelerate_available(): - from accelerate import cpu_offload - else: - raise ImportError("Please install accelerate via `pip install accelerate`") - - device = torch.device(f"cuda:{gpu_id}") - - # TODO: self.prior.post_process_latents is not covered by the offload hooks, so it fails if added to the list - models = [ - self.decoder, - self.text_proj, - self.text_encoder, - self.super_res_first, - self.super_res_last, - ] - for cpu_offloaded_model in models: - if cpu_offloaded_model is not None: - cpu_offload(cpu_offloaded_model, device) - - @property - # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline._execution_device - def _execution_device(self): - r""" - Returns the device on which the pipeline's models will be executed. After calling - `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module - hooks. 
- """ - if self.device != torch.device("meta") or not hasattr(self.decoder, "_hf_hook"): - return self.device - for module in self.decoder.modules(): - if ( - hasattr(module, "_hf_hook") - and hasattr(module._hf_hook, "execution_device") - and module._hf_hook.execution_device is not None - ): - return torch.device(module._hf_hook.execution_device) - return self.device - @torch.no_grad() def __call__( self, diff --git a/examples/community/wildcard_stable_diffusion.py b/examples/community/wildcard_stable_diffusion.py index 1a5ea350b857..241e661536d3 100644 --- a/examples/community/wildcard_stable_diffusion.py +++ b/examples/community/wildcard_stable_diffusion.py @@ -8,9 +8,9 @@ import torch from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer -from diffusers import DiffusionPipeline from diffusers.configuration_utils import FrozenDict from diffusers.models import AutoencoderKL, UNet2DConditionModel +from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler @@ -63,7 +63,7 @@ class WildcardStableDiffusionOutput(StableDiffusionPipelineOutput): prompts: List[str] -class WildcardStableDiffusionPipeline(DiffusionPipeline): +class WildcardStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin): r""" Example Usage: pipe = WildcardStableDiffusionPipeline.from_pretrained( diff --git a/examples/research_projects/controlnetxs/pipeline_controlnet_xs.py b/examples/research_projects/controlnetxs/pipeline_controlnet_xs.py index 32646c7c7715..88a586e9271d 100644 --- a/examples/research_projects/controlnetxs/pipeline_controlnet_xs.py +++ b/examples/research_projects/controlnetxs/pipeline_controlnet_xs.py @@ -26,7 +26,7 @@ from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from diffusers.models import AutoencoderKL, UNet2DConditionModel from diffusers.models.lora import adjust_lora_scale_text_encoder -from diffusers.pipelines.pipeline_utils import DiffusionPipeline +from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker from diffusers.schedulers import KarrasDiffusionSchedulers @@ -44,7 +44,7 @@ class StableDiffusionControlNetXSPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin + DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin ): r""" Pipeline for text-to-image generation using Stable Diffusion with ControlNet-XS guidance. @@ -139,39 +139,6 @@ def __init__( ) self.register_to_config(requires_safety_checker=requires_safety_checker) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. 
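# Editor's sketch (illustrative, not from this patch): sliced VAE decoding as
# described above; it pays off when decoding more than one latent at a time.
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")
pipe.enable_vae_slicing()
images = pipe(["a red fox in snow"] * 4, num_inference_steps=20).images  # decoded slice by slice
pipe.disable_vae_slicing()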
- """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_tiling() - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, @@ -596,34 +563,6 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype latents = latents * self.scheduler.init_noise_sigma return latents - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. 
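# Editor's sketch (illustrative, not from this patch): the scaling factors
# below are values commonly suggested for SD 1.x in the FreeU repository —
# treat them as a starting point, not canon.
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")
pipe.enable_freeu(s1=0.9, s2=0.2, b1=1.2, b2=1.4)
image = pipe("a squirrel playing chess").images[0]
pipe.disable_freeu()  # restore the vanilla UNet pathway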
- """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - @torch.no_grad() def __call__( self, diff --git a/examples/research_projects/controlnetxs/pipeline_controlnet_xs_sd_xl.py b/examples/research_projects/controlnetxs/pipeline_controlnet_xs_sd_xl.py index b9b390f1c00c..d0186573fa9c 100644 --- a/examples/research_projects/controlnetxs/pipeline_controlnet_xs_sd_xl.py +++ b/examples/research_projects/controlnetxs/pipeline_controlnet_xs_sd_xl.py @@ -31,7 +31,7 @@ XFormersAttnProcessor, ) from diffusers.models.lora import adjust_lora_scale_text_encoder -from diffusers.pipelines.pipeline_utils import DiffusionPipeline +from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput from diffusers.schedulers import KarrasDiffusionSchedulers from diffusers.utils import ( @@ -52,7 +52,11 @@ class StableDiffusionXLControlNetXSPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionXLLoraLoaderMixin, FromSingleFileMixin + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + StableDiffusionXLLoraLoaderMixin, + FromSingleFileMixin, ): r""" Pipeline for text-to-image generation using Stable Diffusion XL with ControlNet-XS guidance. @@ -145,39 +149,6 @@ def __init__( self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. 
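# Editor's sketch (illustrative, not from this patch): tiled VAE decoding lets
# the SDXL VAE handle resolutions that would not fit in memory in one pass.
import torch
from diffusers import StableDiffusionXLPipeline

pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")
pipe.enable_vae_tiling()
image = pipe("a panorama of the Alps", height=1024, width=2048).images[0]
pipe.disable_vae_tiling()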
- """ - self.vae.disable_tiling() - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt def encode_prompt( self, @@ -661,34 +632,6 @@ def upcast_vae(self): self.vae.decoder.conv_in.to(dtype) self.vae.decoder.mid_block.to(dtype) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. - """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - @torch.no_grad() def __call__( self, diff --git a/examples/research_projects/rdm/pipeline_rdm.py b/examples/research_projects/rdm/pipeline_rdm.py index 28b4cacb8319..dd97bf71b9db 100644 --- a/examples/research_projects/rdm/pipeline_rdm.py +++ b/examples/research_projects/rdm/pipeline_rdm.py @@ -17,16 +17,17 @@ LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel, - logging, ) from diffusers.image_processor import VaeImageProcessor -from diffusers.utils import is_accelerate_available, randn_tensor +from diffusers.pipelines.pipeline_utils import StableDiffusionMixin +from diffusers.utils import logging +from diffusers.utils.torch_utils import randn_tensor logger = logging.get_logger(__name__) # pylint: disable=invalid-name -class RDMPipeline(DiffusionPipeline): +class RDMPipeline(DiffusionPipeline, StableDiffusionMixin): r""" Pipeline for text-to-image generation using Retrieval Augmented Diffusion. @@ -81,121 +82,6 @@ def __init__( self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.retriever = retriever - def enable_xformers_memory_efficient_attention(self): - r""" - Enable memory efficient attention as implemented in xformers. - - When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference - time. Speed up at training time is not guaranteed. - - Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention - is used. - """ - self.unet.set_use_memory_efficient_attention_xformers(True) - - def disable_xformers_memory_efficient_attention(self): - r""" - Disable memory efficient attention as implemented in xformers. 
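# Editor's sketch (illustrative, not from this patch): the xformers toggle
# removed above; requires the optional `xformers` package with a matching
# CUDA build.
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")
pipe.enable_xformers_memory_efficient_attention()
image = pipe("an isometric city at night").images[0]
pipe.disable_xformers_memory_efficient_attention()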
- """ - self.unet.set_use_memory_efficient_attention_xformers(False) - - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. - - When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several - steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. - - When this option is enabled, the VAE will split the input tensor into tiles to compute decoding and encoding in - several steps. This is useful to save a large amount of memory and to allow the processing of larger images. - """ - self.vae.enable_tiling() - - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously invoked, this method will go back to - computing decoding in one step. - """ - self.vae.disable_tiling() - - def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): - r""" - Enable sliced attention computation. - - When this option is enabled, the attention module will split the input tensor in slices, to compute attention - in several steps. This is useful to save some memory in exchange for a small speed decrease. - - Args: - slice_size (`str` or `int`, *optional*, defaults to `"auto"`): - When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If - a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case, - `attention_head_dim` must be a multiple of `slice_size`. - """ - if slice_size == "auto": - # half the attention head size is usually a good trade-off between - # speed and memory - if isinstance(self.unet.config.attention_head_dim, int): - slice_size = self.unet.config.attention_head_dim // 2 - else: - slice_size = self.unet.config.attention_head_dim[0] // 2 - self.unet.set_attention_slice(slice_size) - - def disable_attention_slicing(self): - r""" - Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go - back to computing attention in one step. - """ - # set slice_size = `None` to disable `attention slicing` - self.enable_attention_slicing(None) - - def enable_sequential_cpu_offload(self): - r""" - Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, - text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a - `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. - """ - if is_accelerate_available(): - from accelerate import cpu_offload - else: - raise ImportError("Please install accelerate via `pip install accelerate`") - - device = torch.device("cuda") - - for cpu_offloaded_model in [self.unet, self.clip, self.vae]: - if cpu_offloaded_model is not None: - cpu_offload(cpu_offloaded_model, device) - - @property - def _execution_device(self): - r""" - Returns the device on which the pipeline's models will be executed. After calling - `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module - hooks. 
- """ - if not hasattr(self.unet, "_hf_hook"): - return self.device - for module in self.unet.modules(): - if ( - hasattr(module, "_hf_hook") - and hasattr(module._hf_hook, "execution_device") - and module._hf_hook.execution_device is not None - ): - return torch.device(module._hf_hook.execution_device) - return self.device - def _encode_prompt(self, prompt): # get prompt text embeddings text_inputs = self.tokenizer( diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 28fde1248d78..165bbf263b29 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -128,6 +128,7 @@ "PNDMPipeline", "RePaintPipeline", "ScoreSdeVePipeline", + "StableDiffusionMixin", ] ) _import_structure["schedulers"].extend( @@ -514,6 +515,7 @@ PNDMPipeline, RePaintPipeline, ScoreSdeVePipeline, + StableDiffusionMixin, ) from .schedulers import ( AmusedScheduler, diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index 62d764c5edbe..5ec8876fc114 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -116,6 +116,8 @@ def __init__( super().__init__() self.inner_dim = out_dim if out_dim is not None else dim_head * heads self.query_dim = query_dim + self.use_bias = bias + self.is_cross_attention = cross_attention_dim is not None self.cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim self.upcast_attention = upcast_attention self.upcast_softmax = upcast_softmax @@ -697,27 +699,32 @@ def norm_encoder_hidden_states(self, encoder_hidden_states: torch.Tensor) -> tor @torch.no_grad() def fuse_projections(self, fuse=True): - is_cross_attention = self.cross_attention_dim != self.query_dim device = self.to_q.weight.data.device dtype = self.to_q.weight.data.dtype - if not is_cross_attention: + if not self.is_cross_attention: # fetch weight matrices. concatenated_weights = torch.cat([self.to_q.weight.data, self.to_k.weight.data, self.to_v.weight.data]) in_features = concatenated_weights.shape[1] out_features = concatenated_weights.shape[0] # create a new single projection layer and copy over the weights. 
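# Editor's sketch (illustrative, not from this patch): a standalone check of
# the fix landing below — when the q/k/v projections carry biases, the biases
# must be concatenated alongside the weights or the fused layer drops them.
import torch
import torch.nn as nn

q, k, v = (nn.Linear(320, 320, bias=True) for _ in range(3))
fused = nn.Linear(320, 3 * 320, bias=True)
with torch.no_grad():
    fused.weight.copy_(torch.cat([q.weight, k.weight, v.weight]))
    fused.bias.copy_(torch.cat([q.bias, k.bias, v.bias]))
x = torch.randn(1, 320)
assert torch.allclose(fused(x), torch.cat([q(x), k(x), v(x)], dim=-1), atol=1e-6)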
- self.to_qkv = self.linear_cls(in_features, out_features, bias=False, device=device, dtype=dtype) + self.to_qkv = self.linear_cls(in_features, out_features, bias=self.use_bias, device=device, dtype=dtype) self.to_qkv.weight.copy_(concatenated_weights) + if self.use_bias: + concatenated_bias = torch.cat([self.to_q.bias.data, self.to_k.bias.data, self.to_v.bias.data]) + self.to_qkv.bias.copy_(concatenated_bias) else: concatenated_weights = torch.cat([self.to_k.weight.data, self.to_v.weight.data]) in_features = concatenated_weights.shape[1] out_features = concatenated_weights.shape[0] - self.to_kv = self.linear_cls(in_features, out_features, bias=False, device=device, dtype=dtype) + self.to_kv = self.linear_cls(in_features, out_features, bias=self.use_bias, device=device, dtype=dtype) self.to_kv.weight.copy_(concatenated_weights) + if self.use_bias: + concatenated_bias = torch.cat([self.to_k.bias.data, self.to_v.bias.data]) + self.to_kv.bias.copy_(concatenated_bias) self.fused_projections = fuse diff --git a/src/diffusers/models/unets/unet_3d_condition.py b/src/diffusers/models/unets/unet_3d_condition.py index 1d5bd57cf8e0..b7641a96a7a1 100644 --- a/src/diffusers/models/unets/unet_3d_condition.py +++ b/src/diffusers/models/unets/unet_3d_condition.py @@ -27,6 +27,7 @@ from ..attention_processor import ( ADDED_KV_ATTENTION_PROCESSORS, CROSS_ATTENTION_PROCESSORS, + Attention, AttentionProcessor, AttnAddedKVProcessor, AttnProcessor, @@ -503,6 +504,44 @@ def disable_freeu(self): if hasattr(upsample_block, k) or getattr(upsample_block, k, None) is not None: setattr(upsample_block, k, None) + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections + def fuse_qkv_projections(self): + """ + Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, + key, value) are fused. For cross-attention modules, key and value projection matrices are fused. + + + + This API is 🧪 experimental. + + + """ + self.original_attn_processors = None + + for _, attn_processor in self.attn_processors.items(): + if "Added" in str(attn_processor.__class__.__name__): + raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.") + + self.original_attn_processors = self.attn_processors + + for module in self.modules(): + if isinstance(module, Attention): + module.fuse_projections(fuse=True) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections + def unfuse_qkv_projections(self): + """Disables the fused QKV projection if enabled. + + + + This API is 🧪 experimental. + + + + """ + if self.original_attn_processors is not None: + self.set_attn_processor(self.original_attn_processors) + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unload_lora def unload_lora(self): """Unloads LoRA weights.""" diff --git a/src/diffusers/models/unets/unet_i2vgen_xl.py b/src/diffusers/models/unets/unet_i2vgen_xl.py index 5dce87254986..a096f842ab6c 100644 --- a/src/diffusers/models/unets/unet_i2vgen_xl.py +++ b/src/diffusers/models/unets/unet_i2vgen_xl.py @@ -474,6 +474,44 @@ def disable_freeu(self): if hasattr(upsample_block, k) or getattr(upsample_block, k, None) is not None: setattr(upsample_block, k, None) + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections + def fuse_qkv_projections(self): + """ + Enables fused QKV projections. 
For self-attention modules, all projection matrices (i.e., query, + key, value) are fused. For cross-attention modules, key and value projection matrices are fused. + + + + This API is 🧪 experimental. + + + """ + self.original_attn_processors = None + + for _, attn_processor in self.attn_processors.items(): + if "Added" in str(attn_processor.__class__.__name__): + raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.") + + self.original_attn_processors = self.attn_processors + + for module in self.modules(): + if isinstance(module, Attention): + module.fuse_projections(fuse=True) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections + def unfuse_qkv_projections(self): + """Disables the fused QKV projection if enabled. + + + + This API is 🧪 experimental. + + + + """ + if self.original_attn_processors is not None: + self.set_attn_processor(self.original_attn_processors) + def forward( self, sample: torch.FloatTensor, diff --git a/src/diffusers/models/unets/unet_motion_model.py b/src/diffusers/models/unets/unet_motion_model.py index 246a4b8124d8..ab2eac4c9a9a 100644 --- a/src/diffusers/models/unets/unet_motion_model.py +++ b/src/diffusers/models/unets/unet_motion_model.py @@ -23,6 +23,7 @@ from ..attention_processor import ( ADDED_KV_ATTENTION_PROCESSORS, CROSS_ATTENTION_PROCESSORS, + Attention, AttentionProcessor, AttnAddedKVProcessor, AttnProcessor, @@ -701,6 +702,44 @@ def disable_freeu(self) -> None: if hasattr(upsample_block, k) or getattr(upsample_block, k, None) is not None: setattr(upsample_block, k, None) + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections + def fuse_qkv_projections(self): + """ + Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, + key, value) are fused. For cross-attention modules, key and value projection matrices are fused. + + + + This API is 🧪 experimental. + + + """ + self.original_attn_processors = None + + for _, attn_processor in self.attn_processors.items(): + if "Added" in str(attn_processor.__class__.__name__): + raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.") + + self.original_attn_processors = self.attn_processors + + for module in self.modules(): + if isinstance(module, Attention): + module.fuse_projections(fuse=True) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections + def unfuse_qkv_projections(self): + """Disables the fused QKV projection if enabled. + + + + This API is 🧪 experimental. 
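# Editor's sketch (illustrative, not from this patch): hedged usage of the
# experimental fusion API being copied onto the video UNets here; shown on a
# plain SD pipeline for brevity.
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")
pipe.unet.fuse_qkv_projections()    # fold q/k/v (or k/v) into one matmul
image = pipe("a timelapse of a blooming flower").images[0]
pipe.unet.unfuse_qkv_projections()  # restore the original attention processors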
+ + + + """ + if self.original_attn_processors is not None: + self.set_attn_processor(self.original_attn_processors) + def forward( self, sample: torch.FloatTensor, diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 1bf41aeaf0df..a1840201f8ba 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -48,6 +48,7 @@ _import_structure["pipeline_utils"] = [ "AudioPipelineOutput", "DiffusionPipeline", + "StableDiffusionMixin", "ImagePipelineOutput", ] _import_structure["deprecated"].extend( @@ -329,6 +330,7 @@ AudioPipelineOutput, DiffusionPipeline, ImagePipelineOutput, + StableDiffusionMixin, ) try: diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py index c794bd00ce85..db52e75fce23 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py @@ -42,7 +42,7 @@ ) from ...utils.torch_utils import randn_tensor from ..free_init_utils import FreeInitMixin -from ..pipeline_utils import DiffusionPipeline +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from .pipeline_output import AnimateDiffPipelineOutput @@ -87,7 +87,12 @@ def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type: class AnimateDiffPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin, FreeInitMixin + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + IPAdapterMixin, + LoraLoaderMixin, + FreeInitMixin, ): r""" Pipeline for text-to-video generation. @@ -411,66 +416,6 @@ def decode_latents(self, latents): video = video.float() return video - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. 
- - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. - """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs def prepare_extra_step_kwargs(self, generator, eta): # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py index 4b5cc12b1265..2963d181b274 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py @@ -35,7 +35,7 @@ from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers from ...utils.torch_utils import randn_tensor from ..free_init_utils import FreeInitMixin -from ..pipeline_utils import DiffusionPipeline +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from .pipeline_output import AnimateDiffPipelineOutput @@ -165,7 +165,12 @@ def retrieve_timesteps( class AnimateDiffVideoToVideoPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin, FreeInitMixin + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + IPAdapterMixin, + LoraLoaderMixin, + FreeInitMixin, ): r""" Pipeline for video-to-video generation. @@ -489,67 +494,6 @@ def decode_latents(self, latents): video = video.float() return video - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. 
When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. - """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs def prepare_extra_step_kwargs(self, generator, eta): # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature diff --git a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py index 438f6736b6a7..69bebdd0dc4f 100644 --- a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py +++ b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py @@ -24,7 +24,7 @@ from ...schedulers import KarrasDiffusionSchedulers from ...utils import logging, replace_example_docstring from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline +from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline, StableDiffusionMixin logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -49,7 +49,7 @@ """ -class AudioLDMPipeline(DiffusionPipeline): +class AudioLDMPipeline(DiffusionPipeline, StableDiffusionMixin): r""" Pipeline for text-to-audio generation using AudioLDM. 
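The AudioLDM hunk above shows the shape of the whole refactor: a pipeline gains `StableDiffusionMixin` in its bases and drops its local copies of the delegating helpers. Below is a toy sketch of that pattern with invented names (`StableDiffusionMixinSketch`, `ToyPipeline`), assuming duck-typed `vae`/`unet` components; it illustrates the idea, not the actual code added to `pipeline_utils`.

```python
class StableDiffusionMixinSketch:
    """Helpers that only delegate to self.vae / self.unet live once, here."""

    def enable_vae_slicing(self):
        self.vae.enable_slicing()

    def disable_vae_slicing(self):
        self.vae.disable_slicing()

    def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
        if not hasattr(self, "unet"):
            raise ValueError("The pipeline must have `unet` for using FreeU.")
        self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)


class ToyPipeline(StableDiffusionMixinSketch):
    def __init__(self, vae, unet):
        # Duck-typed: anything exposing enable_slicing / enable_freeu works.
        self.vae = vae
        self.unet = unet
```

Because every helper is a one-line delegation to a component the pipeline already holds, hoisting them into a mixin deletes the dozens of `# Copied from` duplicates this patch removes without changing any public signature.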
@@ -96,22 +96,6 @@ def __init__( ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - def _encode_prompt( self, prompt, diff --git a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py index dc6df780005e..e01aa9929dd8 100644 --- a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py +++ b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py @@ -173,7 +173,7 @@ def __init__( ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing + # Copied from diffusers.pipelines.pipeline_utils.StableDiffusionMixin.enable_vae_slicing def enable_vae_slicing(self): r""" Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to @@ -181,7 +181,7 @@ def enable_vae_slicing(self): """ self.vae.enable_slicing() - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing + # Copied from diffusers.pipelines.pipeline_utils.StableDiffusionMixin.disable_vae_slicing def disable_vae_slicing(self): r""" Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py index c8af65c78505..0f5fcfeca3c1 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py @@ -36,7 +36,7 @@ unscale_lora_layers, ) from ...utils.torch_utils import is_compiled_module, is_torch_version, randn_tensor -from ..pipeline_utils import DiffusionPipeline +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from ..stable_diffusion.pipeline_output import StableDiffusionPipelineOutput from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker from .multicontrolnet import MultiControlNetModel @@ -137,7 +137,12 @@ def retrieve_timesteps( class StableDiffusionControlNetPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + LoraLoaderMixin, + IPAdapterMixin, + FromSingleFileMixin, ): r""" Pipeline for text-to-image generation using Stable Diffusion with ControlNet guidance. @@ -233,39 +238,6 @@ def __init__( ) self.register_to_config(requires_safety_checker=requires_safety_checker) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. 
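The AudioLDM2 hunk just above is the outlier: it keeps its local method bodies and only retargets their `# Copied from` markers to the mixin. These markers are what diffusers' copy-consistency tooling (e.g. `make fix-copies`) checks, so they must point at the new canonical location even though the code itself is unchanged. Schematically (hypothetical standalone illustration, not a file in the repo):

```python
# A "# Copied from <dotted.path>" marker pins the function below it to a
# canonical definition elsewhere in the repo; the tooling regenerates the body
# if the source changes. The check is textual, so when the canonical method
# moves into StableDiffusionMixin, every marker must be retargeted.

# Copied from diffusers.pipelines.pipeline_utils.StableDiffusionMixin.enable_vae_slicing
def enable_vae_slicing(self):
    self.vae.enable_slicing()
```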
When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_tiling() - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, @@ -824,34 +796,6 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype latents = latents * self.scheduler.init_noise_sigma return latents - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. 
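Since the `enable_freeu` docstring above defers to the FreeU repository for concrete values, here is a hedged usage sketch. The model id is a placeholder, and the s/b values are starting points reported for Stable Diffusion v1.x rather than guaranteed optima.

```python
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# s1/s2 attenuate skip features, b1/b2 amplify backbone features; these are
# in the neighborhood suggested for SD v1.x in the FreeU repository.
pipe.enable_freeu(s1=0.9, s2=0.2, b1=1.2, b2=1.4)
image = pipe("an astronaut riding a horse on the moon").images[0]

pipe.disable_freeu()  # back to the vanilla UNet forward pass
```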
- """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): """ diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py index 377af876aaeb..fe672db5bf8e 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py @@ -35,7 +35,7 @@ unscale_lora_layers, ) from ...utils.torch_utils import is_compiled_module, randn_tensor -from ..pipeline_utils import DiffusionPipeline +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from ..stable_diffusion import StableDiffusionPipelineOutput from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker from .multicontrolnet import MultiControlNetModel @@ -130,7 +130,12 @@ def prepare_image(image): class StableDiffusionControlNetImg2ImgPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + LoraLoaderMixin, + IPAdapterMixin, + FromSingleFileMixin, ): r""" Pipeline for image-to-image generation using Stable Diffusion with ControlNet guidance. @@ -226,39 +231,6 @@ def __init__( ) self.register_to_config(requires_safety_checker=requires_safety_checker) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. 
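For reference, the four VAE helpers being deleted here keep their exact signatures on the mixin, so caller code like the following sketch is unaffected by the move (model id and prompts are placeholders):

```python
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# Slicing: decode a batch of latents one image at a time to cap peak memory.
pipe.enable_vae_slicing()
batch = pipe(["a watercolor fox"] * 4).images

# Tiling: decode a single large image tile by tile.
pipe.enable_vae_tiling()
big = pipe("a watercolor fox", height=1024, width=1024).images[0]

pipe.disable_vae_slicing()
pipe.disable_vae_tiling()
```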
- """ - self.vae.disable_tiling() - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, @@ -866,34 +838,6 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt return latents - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. - """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - @property def guidance_scale(self): return self._guidance_scale diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index b23f78a8b3fd..80a430b15592 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -37,7 +37,7 @@ unscale_lora_layers, ) from ...utils.torch_utils import is_compiled_module, randn_tensor -from ..pipeline_utils import DiffusionPipeline +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from ..stable_diffusion import StableDiffusionPipelineOutput from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker from .multicontrolnet import MultiControlNetModel @@ -241,7 +241,12 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image=False class StableDiffusionControlNetInpaintPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + LoraLoaderMixin, + IPAdapterMixin, + FromSingleFileMixin, ): r""" Pipeline for image inpainting using Stable Diffusion with ControlNet guidance. @@ -351,39 +356,6 @@ def __init__( ) self.register_to_config(requires_safety_checker=requires_safety_checker) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. 
This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_tiling() - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, @@ -1076,34 +1048,6 @@ def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator): return image_latents - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. 
- """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - @property def guidance_scale(self): return self._guidance_scale diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py index c4787eed62be..e42977a7b55a 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py @@ -53,7 +53,7 @@ unscale_lora_layers, ) from ...utils.torch_utils import is_compiled_module, randn_tensor -from ..pipeline_utils import DiffusionPipeline +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput from .multicontrolnet import MultiControlNetModel @@ -151,7 +151,7 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): class StableDiffusionXLControlNetInpaintPipeline( - DiffusionPipeline, StableDiffusionXLLoraLoaderMixin, FromSingleFileMixin, IPAdapterMixin + DiffusionPipeline, StableDiffusionMixin, StableDiffusionXLLoraLoaderMixin, FromSingleFileMixin, IPAdapterMixin ): r""" Pipeline for text-to-image generation using Stable Diffusion XL. @@ -245,39 +245,6 @@ def __init__( else: self.watermark = None - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. 
- """ - self.vae.disable_tiling() - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt def encode_prompt( self, @@ -1104,34 +1071,6 @@ def upcast_vae(self): self.vae.decoder.conv_in.to(dtype) self.vae.decoder.mid_block.to(dtype) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. - """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - @property def guidance_scale(self): return self._guidance_scale diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py index 7c5a6e39abd4..4e0a880a4a11 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py @@ -55,7 +55,7 @@ unscale_lora_layers, ) from ...utils.torch_utils import is_compiled_module, is_torch_version, randn_tensor -from ..pipeline_utils import DiffusionPipeline +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput @@ -116,6 +116,7 @@ class StableDiffusionXLControlNetPipeline( DiffusionPipeline, + StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionXLLoraLoaderMixin, IPAdapterMixin, @@ -222,39 +223,6 @@ def __init__( self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. 
If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_tiling() - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt def encode_prompt( self, @@ -873,34 +841,6 @@ def upcast_vae(self): self.vae.decoder.conv_in.to(dtype) self.vae.decoder.mid_block.to(dtype) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. 
- """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): """ diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py index 273297514a16..41a8c4fa005e 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py @@ -54,7 +54,7 @@ unscale_lora_layers, ) from ...utils.torch_utils import is_compiled_module, randn_tensor -from ..pipeline_utils import DiffusionPipeline +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput @@ -157,7 +157,11 @@ def retrieve_latents( class StableDiffusionXLControlNetImg2ImgPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionXLLoraLoaderMixin, IPAdapterMixin + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + StableDiffusionXLLoraLoaderMixin, + IPAdapterMixin, ): r""" Pipeline for image-to-image generation using Stable Diffusion XL with ControlNet guidance. @@ -271,39 +275,6 @@ def __init__( self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) self.register_to_config(requires_aesthetics_score=requires_aesthetics_score) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. 
- """ - self.vae.disable_tiling() - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt def encode_prompt( self, @@ -1030,34 +1001,6 @@ def upcast_vae(self): self.vae.decoder.conv_in.to(dtype) self.vae.decoder.mid_block.to(dtype) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. - """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - @property def guidance_scale(self): return self._guidance_scale diff --git a/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py index 1d377dd97855..e4583699e79e 100644 --- a/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +++ b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py @@ -23,7 +23,6 @@ from ....image_processor import PipelineImageInput, VaeImageProcessor from ....loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ....models import AutoencoderKL, ImageProjection, UNet2DConditionModel -from ....models.attention_processor import FusedAttnProcessor2_0 from ....models.lora import adjust_lora_scale_text_encoder from ....schedulers import KarrasDiffusionSchedulers from ....utils import ( @@ -35,7 +34,7 @@ unscale_lora_layers, ) from ....utils.torch_utils import randn_tensor -from ...pipeline_utils import DiffusionPipeline +from ...pipeline_utils import DiffusionPipeline, StableDiffusionMixin from ...stable_diffusion.safety_checker import StableDiffusionSafetyChecker from .modeling_roberta_series import RobertaSeriesModelWithTransformation from .pipeline_output import AltDiffusionPipelineOutput @@ -120,7 +119,12 @@ def retrieve_timesteps( class AltDiffusionPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + LoraLoaderMixin, + IPAdapterMixin, + FromSingleFileMixin, ): r""" Pipeline for text-to-image generation using Alt Diffusion. 
@@ -252,35 +256,6 @@ def __init__(
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
         self.register_to_config(requires_safety_checker=requires_safety_checker)

-    def enable_vae_slicing(self):
-        r"""
-        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
-        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
-        """
-        self.vae.enable_slicing()
-
-    def disable_vae_slicing(self):
-        r"""
-        Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
-        computing decoding in one step.
-        """
-        self.vae.disable_slicing()
-
-    def enable_vae_tiling(self):
-        r"""
-        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
-        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
-        processing larger images.
-        """
-        self.vae.enable_tiling()
-
-    def disable_vae_tiling(self):
-        r"""
-        Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
-        computing decoding in one step.
-        """
-        self.vae.disable_tiling()
-
     def _encode_prompt(
         self,
         prompt,
@@ -629,91 +604,6 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype
         latents = latents * self.scheduler.init_noise_sigma
         return latents

-    def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
-        r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
-        The suffixes after the scaling factors represent the stages where they are being applied.
-
-        Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
-        that are known to work well for different pipelines such as Alt Diffusion v1, v2, and Alt Diffusion XL.
-
-        Args:
-            s1 (`float`):
-                Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
-                mitigate "oversmoothing effect" in the enhanced denoising process.
-            s2 (`float`):
-                Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
-                mitigate "oversmoothing effect" in the enhanced denoising process.
-            b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
-            b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
-        """
-        if not hasattr(self, "unet"):
-            raise ValueError("The pipeline must have `unet` for using FreeU.")
-        self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
-    def disable_freeu(self):
-        """Disables the FreeU mechanism if enabled."""
-        self.unet.disable_freeu()
-
-    def fuse_qkv_projections(self, unet: bool = True, vae: bool = True):
-        """
-        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
-        key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
-
-        <Tip warning={true}>
-
-        This API is 🧪 experimental.
-
-        </Tip>
-
-        Args:
-            unet (`bool`, defaults to `True`): To apply fusion on the UNet.
-            vae (`bool`, defaults to `True`): To apply fusion on the VAE.
-        """
-        self.fusing_unet = False
-        self.fusing_vae = False
-
-        if unet:
-            self.fusing_unet = True
-            self.unet.fuse_qkv_projections()
-            self.unet.set_attn_processor(FusedAttnProcessor2_0())
-
-        if vae:
-            if not isinstance(self.vae, AutoencoderKL):
-                raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.")
-
-            self.fusing_vae = True
-            self.vae.fuse_qkv_projections()
-            self.vae.set_attn_processor(FusedAttnProcessor2_0())
-
-    def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True):
-        """Disable QKV projection fusion if enabled.
-
-        <Tip warning={true}>
-
-        This API is 🧪 experimental.
-
-        </Tip>
-
-        Args:
-            unet (`bool`, defaults to `True`): To apply fusion on the UNet.
-            vae (`bool`, defaults to `True`): To apply fusion on the VAE.
-
-        """
-        if unet:
-            if not self.fusing_unet:
-                logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.")
-            else:
-                self.unet.unfuse_qkv_projections()
-                self.fusing_unet = False
-
-        if vae:
-            if not self.fusing_vae:
-                logger.warning("The VAE was not initially fused for QKV projections. Doing nothing.")
-            else:
-                self.vae.unfuse_qkv_projections()
-                self.fusing_vae = False
-
     def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
         """
         See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
diff --git a/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py
index a9f058bb240b..156e52c249d9 100644
--- a/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py
+++ b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py
@@ -25,7 +25,6 @@
 from ....image_processor import PipelineImageInput, VaeImageProcessor
 from ....loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
 from ....models import AutoencoderKL, ImageProjection, UNet2DConditionModel
-from ....models.attention_processor import FusedAttnProcessor2_0
 from ....models.lora import adjust_lora_scale_text_encoder
 from ....schedulers import KarrasDiffusionSchedulers
 from ....utils import (
@@ -38,7 +37,7 @@
     unscale_lora_layers,
 )
 from ....utils.torch_utils import randn_tensor
-from ...pipeline_utils import DiffusionPipeline
+from ...pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from ...stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from .modeling_roberta_series import RobertaSeriesModelWithTransformation
 from .pipeline_output import AltDiffusionPipelineOutput
@@ -160,7 +159,12 @@ def retrieve_timesteps(


 class AltDiffusionImg2ImgPipeline(
-    DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin, FromSingleFileMixin
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    IPAdapterMixin,
+    LoraLoaderMixin,
+    FromSingleFileMixin,
 ):
     r"""
     Pipeline for text-guided image-to-image generation using Alt Diffusion.
@@ -689,91 +693,6 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt
         return latents

-    def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
-        r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
-        The suffixes after the scaling factors represent the stages where they are being applied.
-
-        Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
-        that are known to work well for different pipelines such as Alt Diffusion v1, v2, and Alt Diffusion XL.
-
-        Args:
-            s1 (`float`):
-                Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
-                mitigate "oversmoothing effect" in the enhanced denoising process.
-            s2 (`float`):
-                Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
-                mitigate "oversmoothing effect" in the enhanced denoising process.
-            b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
-            b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
-        """
-        if not hasattr(self, "unet"):
-            raise ValueError("The pipeline must have `unet` for using FreeU.")
-        self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
-    def disable_freeu(self):
-        """Disables the FreeU mechanism if enabled."""
-        self.unet.disable_freeu()
-
-    def fuse_qkv_projections(self, unet: bool = True, vae: bool = True):
-        """
-        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
-        key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
-
-        <Tip warning={true}>
-
-        This API is 🧪 experimental.
-
-        </Tip>
-
-        Args:
-            unet (`bool`, defaults to `True`): To apply fusion on the UNet.
-            vae (`bool`, defaults to `True`): To apply fusion on the VAE.
-        """
-        self.fusing_unet = False
-        self.fusing_vae = False
-
-        if unet:
-            self.fusing_unet = True
-            self.unet.fuse_qkv_projections()
-            self.unet.set_attn_processor(FusedAttnProcessor2_0())
-
-        if vae:
-            if not isinstance(self.vae, AutoencoderKL):
-                raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.")
-
-            self.fusing_vae = True
-            self.vae.fuse_qkv_projections()
-            self.vae.set_attn_processor(FusedAttnProcessor2_0())
-
-    def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True):
-        """Disable QKV projection fusion if enabled.
-
-        <Tip warning={true}>
-
-        This API is 🧪 experimental.
-
-        </Tip>
-
-        Args:
-            unet (`bool`, defaults to `True`): To apply fusion on the UNet.
-            vae (`bool`, defaults to `True`): To apply fusion on the VAE.
-
-        """
-        if unet:
-            if not self.fusing_unet:
-                logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.")
-            else:
-                self.unet.unfuse_qkv_projections()
-                self.fusing_unet = False
-
-        if vae:
-            if not self.fusing_vae:
-                logger.warning("The VAE was not initially fused for QKV projections.
Doing nothing.") - else: - self.vae.unfuse_qkv_projections() - self.fusing_vae = False - def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py index e61c35f9c504..dee93fc2eb53 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py @@ -26,7 +26,7 @@ from ....schedulers.scheduling_utils import SchedulerMixin from ....utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers from ....utils.torch_utils import randn_tensor -from ...pipeline_utils import DiffusionPipeline +from ...pipeline_utils import DiffusionPipeline, StableDiffusionMixin from ...stable_diffusion.pipeline_output import StableDiffusionPipelineOutput from ...stable_diffusion.safety_checker import StableDiffusionSafetyChecker @@ -36,7 +36,9 @@ AUGS_CONST = ["A photo of ", "An image of ", "A picture of "] -class StableDiffusionModelEditingPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): +class StableDiffusionModelEditingPipeline( + DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin +): r""" Pipeline for text-to-image model editing. @@ -153,22 +155,6 @@ def append_ca(net_): self.projection_matrices = self.projection_matrices + [l.to_k for l in self.ca_clip_layers] self.og_matrices = self.og_matrices + [copy.deepcopy(l.to_k) for l in self.ca_clip_layers] - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. 
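The Alt Diffusion hunks above also removed the pipeline-level `fuse_qkv_projections(unet=..., vae=...)` wrapper, which pairs weight fusion with `FusedAttnProcessor2_0`; it too now lives once on `StableDiffusionMixin`, so user code keeps working. A usage sketch (placeholder model id; the fused processor path assumes PyTorch 2.0+, since it relies on `scaled_dot_product_attention`):

```python
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# Fuse QKV weights in both the UNet and the VAE, then generate as usual.
pipe.fuse_qkv_projections(unet=True, vae=True)
image = pipe("a photo of a corgi wearing sunglasses").images[0]

# Restore the separate q/k/v projections.
pipe.unfuse_qkv_projections()
```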
- """ - self.vae.disable_slicing() - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py index a37f2870cb02..ddc866ef9b86 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py @@ -32,7 +32,7 @@ unscale_lora_layers, ) from ....utils.torch_utils import randn_tensor -from ...pipeline_utils import DiffusionPipeline +from ...pipeline_utils import DiffusionPipeline, StableDiffusionMixin from ...stable_diffusion.pipeline_output import StableDiffusionPipelineOutput from ...stable_diffusion.safety_checker import StableDiffusionSafetyChecker @@ -63,7 +63,7 @@ class StableDiffusionParadigmsPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin + DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin ): r""" Pipeline for text-to-image generation using a parallelized version of Stable Diffusion. @@ -146,39 +146,6 @@ def __init__( # attribute to wrap the unet with torch.nn.DataParallel when running multiple denoising steps on multiple GPUs self.wrapped_unet = self.unet - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. 
- """ - self.vae.disable_tiling() - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py index c2e2369f27f8..c819e5728181 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py @@ -46,7 +46,7 @@ unscale_lora_layers, ) from ....utils.torch_utils import randn_tensor -from ...pipeline_utils import DiffusionPipeline +from ...pipeline_utils import DiffusionPipeline, StableDiffusionMixin from ...stable_diffusion.pipeline_output import StableDiffusionPipelineOutput from ...stable_diffusion.safety_checker import StableDiffusionSafetyChecker @@ -280,7 +280,7 @@ def __call__( return hidden_states -class StableDiffusionPix2PixZeroPipeline(DiffusionPipeline): +class StableDiffusionPix2PixZeroPipeline(DiffusionPipeline, StableDiffusionMixin): r""" Pipeline for pixel-level image editing using Pix2Pix Zero. Based on Stable Diffusion. diff --git a/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py b/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py index 5354f6643cb7..2df21533962c 100644 --- a/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +++ b/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py @@ -31,7 +31,7 @@ replace_example_docstring, ) from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -103,7 +103,10 @@ class I2VGenXLPipelineOutput(BaseOutput): frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]] -class I2VGenXLPipeline(DiffusionPipeline): +class I2VGenXLPipeline( + DiffusionPipeline, + StableDiffusionMixin, +): r""" Pipeline for image-to-video generation as proposed in [I2VGenXL](https://i2vgen-xl.github.io/). @@ -161,39 +164,6 @@ def guidance_scale(self): def do_classifier_free_guidance(self): return self._guidance_scale > 1 - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. 
- """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_tiling() - def encode_prompt( self, prompt, @@ -542,34 +512,6 @@ def prepare_latents( latents = latents * self.scheduler.init_noise_sigma return latents - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. - """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - @torch.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( diff --git a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py index f914020dd505..e6adae97ea6c 100644 --- a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +++ b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py @@ -36,7 +36,7 @@ unscale_lora_layers, ) from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from ..stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker @@ -129,7 +129,12 @@ def retrieve_timesteps( class LatentConsistencyModelImg2ImgPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin, FromSingleFileMixin + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + IPAdapterMixin, + LoraLoaderMixin, + FromSingleFileMixin, ): r""" Pipeline for image-to-image generation using a latent consistency model. 
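The `get_guidance_scale_embedding` helper referenced throughout these hunks (and kept by the latent consistency pipelines) turns the guidance scale `w` into a sinusoidal feature vector consumed via `timestep_cond`. Below is an approximate reconstruction following the VDM code linked in its docstring — a sketch, not the verbatim diffusers helper:

```python
import math
import torch

def guidance_scale_embedding(w: torch.Tensor, embedding_dim: int = 512) -> torch.Tensor:
    """Sketch: scale w by 1000, then expand into sin/cos features."""
    assert w.ndim == 1
    w = w * 1000.0
    half_dim = embedding_dim // 2
    emb = math.log(10000.0) / (half_dim - 1)
    emb = torch.exp(torch.arange(half_dim, dtype=torch.float32) * -emb)
    emb = w.to(torch.float32)[:, None] * emb[None, :]
    emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
    if embedding_dim % 2 == 1:
        emb = torch.nn.functional.pad(emb, (0, 1))  # zero-pad odd dims
    return emb

cond = guidance_scale_embedding(torch.tensor([7.5]), embedding_dim=256)
print(cond.shape)  # torch.Size([1, 256])
```

LCM-style UNets are distilled with this embedding in place of classifier-free guidance at inference time, which is why the pipelines keep the helper even as the VAE/FreeU plumbing moves to the mixin.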
@@ -209,67 +214,6 @@ def __init__( self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. 
- """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt def encode_prompt( self, diff --git a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py index 967d845367d4..0112fb916369 100644 --- a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +++ b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py @@ -35,7 +35,7 @@ unscale_lora_layers, ) from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from ..stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker @@ -107,7 +107,12 @@ def retrieve_timesteps( class LatentConsistencyModelPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin, FromSingleFileMixin + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + IPAdapterMixin, + LoraLoaderMixin, + FromSingleFileMixin, ): r""" Pipeline for text-to-image generation using a latent consistency model. @@ -193,67 +198,6 @@ def __init__( self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.register_to_config(requires_safety_checker=requires_safety_checker) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. 
- """ - self.vae.disable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. - """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt def encode_prompt( self, diff --git a/src/diffusers/pipelines/musicldm/pipeline_musicldm.py b/src/diffusers/pipelines/musicldm/pipeline_musicldm.py index 69bd0521d558..5fde3450b9a0 100644 --- a/src/diffusers/pipelines/musicldm/pipeline_musicldm.py +++ b/src/diffusers/pipelines/musicldm/pipeline_musicldm.py @@ -36,7 +36,7 @@ replace_example_docstring, ) from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline +from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline, StableDiffusionMixin if is_librosa_available(): @@ -64,7 +64,7 @@ """ -class MusicLDMPipeline(DiffusionPipeline): +class MusicLDMPipeline(DiffusionPipeline, StableDiffusionMixin): r""" Pipeline for text-to-audio generation using MusicLDM. @@ -113,22 +113,6 @@ def __init__( ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. 
- """ - self.vae.disable_slicing() - def _encode_prompt( self, prompt, diff --git a/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py b/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py index 8effa94849c9..8a24f134e793 100644 --- a/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +++ b/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py @@ -25,7 +25,7 @@ from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler from ...utils import deprecate, logging from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from ..stable_diffusion import StableDiffusionPipelineOutput from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker from .image_encoder import PaintByExampleImageEncoder @@ -148,7 +148,7 @@ def prepare_mask_and_masked_image(image, mask): return mask, masked_image -class PaintByExamplePipeline(DiffusionPipeline): +class PaintByExamplePipeline(DiffusionPipeline, StableDiffusionMixin): r""" diff --git a/src/diffusers/pipelines/pia/pipeline_pia.py b/src/diffusers/pipelines/pia/pipeline_pia.py index 071caa1a33dd..9feebf29d797 100644 --- a/src/diffusers/pipelines/pia/pipeline_pia.py +++ b/src/diffusers/pipelines/pia/pipeline_pia.py @@ -46,7 +46,7 @@ ) from ...utils.torch_utils import randn_tensor from ..free_init_utils import FreeInitMixin -from ..pipeline_utils import DiffusionPipeline +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -211,7 +211,13 @@ class PIAPipelineOutput(BaseOutput): class PIAPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin, FromSingleFileMixin, FreeInitMixin + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + IPAdapterMixin, + LoraLoaderMixin, + FromSingleFileMixin, + FreeInitMixin, ): r""" Pipeline for text-to-video generation. @@ -500,67 +506,6 @@ def decode_latents(self, latents): video = video.float() return video - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. 
If `enable_vae_tiling` was previously enabled, this method will go back to
-        computing decoding in one step.
-        """
-        self.vae.disable_tiling()
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
-    def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
-        r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
-        The suffixes after the scaling factors represent the stages where they are being applied.
-
-        Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
-        that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
-        Args:
-            s1 (`float`):
-                Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
-                mitigate "oversmoothing effect" in the enhanced denoising process.
-            s2 (`float`):
-                Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
-                mitigate "oversmoothing effect" in the enhanced denoising process.
-            b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
-            b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
-        """
-        if not hasattr(self, "unet"):
-            raise ValueError("The pipeline must have `unet` for using FreeU.")
-        self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
-    def disable_freeu(self):
-        """Disables the FreeU mechanism if enabled."""
-        self.unet.disable_freeu()
-
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
     def prepare_extra_step_kwargs(self, generator, eta):
         # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py
index adb32a782b3e..afdd7ab2161e 100644
--- a/src/diffusers/pipelines/pipeline_utils.py
+++ b/src/diffusers/pipelines/pipeline_utils.py
@@ -42,6 +42,8 @@
 from .. import __version__
 from ..configuration_utils import ConfigMixin
+from ..models import AutoencoderKL
+from ..models.attention_processor import FusedAttnProcessor2_0
 from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT
 from ..schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
 from ..utils import (
@@ -2093,3 +2095,123 @@ def set_attention_slice(self, slice_size: Optional[int]):
 
         for module in modules:
             module.set_attention_slice(slice_size)
+
+
+class StableDiffusionMixin:
+    r"""
+    Helper for DiffusionPipeline with VAE and UNet (mainly for latent diffusion models such as Stable Diffusion).
+    """
+
+    def enable_vae_slicing(self):
+        r"""
+        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
+        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
+        """
+        self.vae.enable_slicing()
+
+    def disable_vae_slicing(self):
+        r"""
+        Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
+        computing decoding in one step.
+        """
+        self.vae.disable_slicing()
+
+    def enable_vae_tiling(self):
+        r"""
+        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
+        compute decoding and encoding in several steps.
This is useful for saving a large amount of memory and to allow + processing larger images. + """ + self.vae.enable_tiling() + + def disable_vae_tiling(self): + r""" + Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to + computing decoding in one step. + """ + self.vae.disable_tiling() + + def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): + r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. + + The suffixes after the scaling factors represent the stages where they are being applied. + + Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values + that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. + + Args: + s1 (`float`): + Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to + mitigate "oversmoothing effect" in the enhanced denoising process. + s2 (`float`): + Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to + mitigate "oversmoothing effect" in the enhanced denoising process. + b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. + b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. + """ + if not hasattr(self, "unet"): + raise ValueError("The pipeline must have `unet` for using FreeU.") + self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) + + def disable_freeu(self): + """Disables the FreeU mechanism if enabled.""" + self.unet.disable_freeu() + + def fuse_qkv_projections(self, unet: bool = True, vae: bool = True): + """ + Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, + key, value) are fused. For cross-attention modules, key and value projection matrices are fused. + + + + This API is 🧪 experimental. + + + + Args: + unet (`bool`, defaults to `True`): To apply fusion on the UNet. + vae (`bool`, defaults to `True`): To apply fusion on the VAE. + """ + self.fusing_unet = False + self.fusing_vae = False + + if unet: + self.fusing_unet = True + self.unet.fuse_qkv_projections() + self.unet.set_attn_processor(FusedAttnProcessor2_0()) + + if vae: + if not isinstance(self.vae, AutoencoderKL): + raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.") + + self.fusing_vae = True + self.vae.fuse_qkv_projections() + self.vae.set_attn_processor(FusedAttnProcessor2_0()) + + def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True): + """Disable QKV projection fusion if enabled. + + + + This API is 🧪 experimental. + + + + Args: + unet (`bool`, defaults to `True`): To apply fusion on the UNet. + vae (`bool`, defaults to `True`): To apply fusion on the VAE. + + """ + if unet: + if not self.fusing_unet: + logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.") + else: + self.unet.unfuse_qkv_projections() + self.fusing_unet = False + + if vae: + if not self.fusing_vae: + logger.warning("The VAE was not initially fused for QKV projections. 
Doing nothing.") + else: + self.vae.unfuse_qkv_projections() + self.fusing_vae = False diff --git a/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py b/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py index a1cb3f5af378..f0e25264ffa7 100644 --- a/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +++ b/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py @@ -11,14 +11,14 @@ from ...schedulers import KarrasDiffusionSchedulers from ...utils import deprecate, logging from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from .pipeline_output import SemanticStableDiffusionPipelineOutput logger = logging.get_logger(__name__) # pylint: disable=invalid-name -class SemanticStableDiffusionPipeline(DiffusionPipeline): +class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin): r""" Pipeline for text-to-image generation using Stable Diffusion with latent editing. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index a62b050afe92..762565ea1fd3 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -23,7 +23,6 @@ from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel -from ...models.attention_processor import FusedAttnProcessor2_0 from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( @@ -35,7 +34,7 @@ unscale_lora_layers, ) from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from .pipeline_output import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker @@ -116,7 +115,12 @@ def retrieve_timesteps( class StableDiffusionPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + LoraLoaderMixin, + IPAdapterMixin, + FromSingleFileMixin, ): r""" Pipeline for text-to-image generation using Stable Diffusion. @@ -248,35 +252,6 @@ def __init__( self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.register_to_config(requires_safety_checker=requires_safety_checker) - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. 
This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_tiling() - def _encode_prompt( self, prompt, @@ -666,93 +641,6 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype latents = latents * self.scheduler.init_noise_sigma return latents - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. - """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.fuse_qkv_projections - def fuse_qkv_projections(self, unet: bool = True, vae: bool = True): - """ - Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, - key, value) are fused. For cross-attention modules, key and value projection matrices are fused. - - - - This API is 🧪 experimental. - - - - Args: - unet (`bool`, defaults to `True`): To apply fusion on the UNet. - vae (`bool`, defaults to `True`): To apply fusion on the VAE. - """ - self.fusing_unet = False - self.fusing_vae = False - - if unet: - self.fusing_unet = True - self.unet.fuse_qkv_projections() - self.unet.set_attn_processor(FusedAttnProcessor2_0()) - - if vae: - if not isinstance(self.vae, AutoencoderKL): - raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.") - - self.fusing_vae = True - self.vae.fuse_qkv_projections() - self.vae.set_attn_processor(FusedAttnProcessor2_0()) - - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.unfuse_qkv_projections - def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True): - """Disable QKV projection fusion if enabled. - - - - This API is 🧪 experimental. - - - - Args: - unet (`bool`, defaults to `True`): To apply fusion on the UNet. - vae (`bool`, defaults to `True`): To apply fusion on the VAE. - - """ - if unet: - if not self.fusing_unet: - logger.warning("The UNet was not initially fused for QKV projections. 
Doing nothing.") - else: - self.unet.unfuse_qkv_projections() - self.fusing_unet = False - - if vae: - if not self.fusing_vae: - logger.warning("The VAE was not initially fused for QKV projections. Doing nothing.") - else: - self.vae.unfuse_qkv_projections() - self.fusing_vae = False - # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): """ diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py index fa797a7d9f3a..1333cb825750 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py @@ -26,7 +26,7 @@ from ...schedulers import KarrasDiffusionSchedulers from ...utils import deprecate, logging from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from . import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker @@ -34,7 +34,7 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name -class StableDiffusionImageVariationPipeline(DiffusionPipeline): +class StableDiffusionImageVariationPipeline(DiffusionPipeline, StableDiffusionMixin): r""" Pipeline to generate image variations from an input image using Stable Diffusion. @@ -240,34 +240,6 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype latents = latents * self.scheduler.init_noise_sigma return latents - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. 
- """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - @torch.no_grad() def __call__( self, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 5c6e67d7282b..23b6e0f20520 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -25,7 +25,6 @@ from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel -from ...models.attention_processor import FusedAttnProcessor2_0 from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( @@ -38,7 +37,7 @@ unscale_lora_layers, ) from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from . import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker @@ -156,7 +155,12 @@ def retrieve_timesteps( class StableDiffusionImg2ImgPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin, FromSingleFileMixin + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + IPAdapterMixin, + LoraLoaderMixin, + FromSingleFileMixin, ): r""" Pipeline for text-guided image-to-image generation using Stable Diffusion. @@ -288,39 +292,6 @@ def __init__( self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.register_to_config(requires_safety_checker=requires_safety_checker) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. 
If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_tiling() - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, @@ -768,95 +739,6 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt return latents - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. - """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.fuse_qkv_projections - def fuse_qkv_projections(self, unet: bool = True, vae: bool = True): - """ - Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, - key, value) are fused. For cross-attention modules, key and value projection matrices are fused. - - - - This API is 🧪 experimental. - - - - Args: - unet (`bool`, defaults to `True`): To apply fusion on the UNet. - vae (`bool`, defaults to `True`): To apply fusion on the VAE. - """ - self.fusing_unet = False - self.fusing_vae = False - - if unet: - self.fusing_unet = True - self.unet.fuse_qkv_projections() - self.unet.set_attn_processor(FusedAttnProcessor2_0()) - - if vae: - if not isinstance(self.vae, AutoencoderKL): - raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.") - - self.fusing_vae = True - self.vae.fuse_qkv_projections() - self.vae.set_attn_processor(FusedAttnProcessor2_0()) - - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.unfuse_qkv_projections - def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True): - """Disable QKV projection fusion if enabled. - - - - This API is 🧪 experimental. - - - - Args: - unet (`bool`, defaults to `True`): To apply fusion on the UNet. - vae (`bool`, defaults to `True`): To apply fusion on the VAE. - - """ - if unet: - if not self.fusing_unet: - logger.warning("The UNet was not initially fused for QKV projections. 
Doing nothing.") - else: - self.unet.unfuse_qkv_projections() - self.fusing_unet = False - - if vae: - if not self.fusing_vae: - logger.warning("The VAE was not initially fused for QKV projections. Doing nothing.") - else: - self.vae.unfuse_qkv_projections() - self.fusing_vae = False - # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): """ diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index 354950810e69..4664da1550f2 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -25,12 +25,11 @@ from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AsymmetricAutoencoderKL, AutoencoderKL, ImageProjection, UNet2DConditionModel -from ...models.attention_processor import FusedAttnProcessor2_0 from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers from ...utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from . import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker @@ -220,7 +219,12 @@ def retrieve_timesteps( class StableDiffusionInpaintPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin, FromSingleFileMixin + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + IPAdapterMixin, + LoraLoaderMixin, + FromSingleFileMixin, ): r""" Pipeline for text-guided image inpainting using Stable Diffusion. @@ -360,39 +364,6 @@ def __init__( ) self.register_to_config(requires_safety_checker=requires_safety_checker) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. 
- """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_tiling() - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, @@ -910,95 +881,6 @@ def get_timesteps(self, num_inference_steps, strength, device): return timesteps, num_inference_steps - t_start - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. - """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.fuse_qkv_projections - def fuse_qkv_projections(self, unet: bool = True, vae: bool = True): - """ - Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, - key, value) are fused. For cross-attention modules, key and value projection matrices are fused. - - - - This API is 🧪 experimental. - - - - Args: - unet (`bool`, defaults to `True`): To apply fusion on the UNet. - vae (`bool`, defaults to `True`): To apply fusion on the VAE. - """ - self.fusing_unet = False - self.fusing_vae = False - - if unet: - self.fusing_unet = True - self.unet.fuse_qkv_projections() - self.unet.set_attn_processor(FusedAttnProcessor2_0()) - - if vae: - if not isinstance(self.vae, AutoencoderKL): - raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.") - - self.fusing_vae = True - self.vae.fuse_qkv_projections() - self.vae.set_attn_processor(FusedAttnProcessor2_0()) - - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.unfuse_qkv_projections - def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True): - """Disable QKV projection fusion if enabled. - - - - This API is 🧪 experimental. 
- - - - Args: - unet (`bool`, defaults to `True`): To apply fusion on the UNet. - vae (`bool`, defaults to `True`): To apply fusion on the VAE. - - """ - if unet: - if not self.fusing_unet: - logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.") - else: - self.unet.unfuse_qkv_projections() - self.fusing_unet = False - - if vae: - if not self.fusing_vae: - logger.warning("The VAE was not initially fused for QKV projections. Doing nothing.") - else: - self.vae.unfuse_qkv_projections() - self.fusing_vae = False - # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): """ diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py index f4bb8267aac7..89d4278937fe 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py @@ -26,7 +26,7 @@ from ...schedulers import KarrasDiffusionSchedulers from ...utils import PIL_INTERPOLATION, deprecate, logging from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from . import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker @@ -73,7 +73,7 @@ def retrieve_latents( class StableDiffusionInstructPix2PixPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin + DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin ): r""" Pipeline for pixel-level image editing by following text instructions (based on Stable Diffusion). @@ -807,34 +807,6 @@ def prepare_image_latents( return image_latents - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. 
- """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - @property def guidance_scale(self): return self._guidance_scale diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py index 8d272fa5748c..918dffe5199d 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py @@ -27,7 +27,7 @@ from ...schedulers import EulerDiscreteScheduler from ...utils import deprecate, logging from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput +from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput, StableDiffusionMixin logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -60,7 +60,7 @@ def preprocess(image): return image -class StableDiffusionLatentUpscalePipeline(DiffusionPipeline, FromSingleFileMixin): +class StableDiffusionLatentUpscalePipeline(DiffusionPipeline, StableDiffusionMixin, FromSingleFileMixin): r""" Pipeline for upscaling Stable Diffusion output image resolution by a factor of 2. @@ -258,34 +258,6 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype latents = latents * self.scheduler.init_noise_sigma return latents - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. 
- """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - @torch.no_grad() def __call__( self, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py index f2b77a6d17b9..2d04cf41d9b5 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py @@ -34,7 +34,7 @@ from ...schedulers import DDPMScheduler, KarrasDiffusionSchedulers from ...utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from . import StableDiffusionPipelineOutput @@ -68,7 +68,7 @@ def preprocess(image): class StableDiffusionUpscalePipeline( - DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin + DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin ): r""" Pipeline for text-guided image super-resolution using Stable Diffusion 2. @@ -530,34 +530,6 @@ def upcast_vae(self): self.vae.decoder.conv_in.to(dtype) self.vae.decoder.mid_block.to(dtype) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. 
- """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - @torch.no_grad() def __call__( self, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py index 8b66fa0f1972..c62e0f4ec50f 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py @@ -34,7 +34,7 @@ unscale_lora_layers, ) from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput +from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput, StableDiffusionMixin from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer @@ -58,7 +58,7 @@ """ -class StableUnCLIPPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): +class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin): """ Pipeline for text-to-image generation using stable unCLIP. @@ -155,22 +155,6 @@ def __init__( self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline._encode_prompt with _encode_prompt->_encode_prior_prompt, tokenizer->prior_tokenizer, text_encoder->prior_text_encoder def _encode_prior_prompt( self, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py index feb482fb429c..9b85d9e6b1a4 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py @@ -34,7 +34,7 @@ unscale_lora_layers, ) from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput +from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput, StableDiffusionMixin from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer @@ -69,7 +69,9 @@ """ -class StableUnCLIPImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): +class StableUnCLIPImg2ImgPipeline( + DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin +): """ Pipeline for text-guided image-to-image generation using stable unCLIP. 
@@ -156,22 +158,6 @@ def __init__( self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, diff --git a/src/diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py b/src/diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py index a6e593282996..03c80b46b806 100644 --- a/src/diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +++ b/src/diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py @@ -36,7 +36,7 @@ unscale_lora_layers, ) from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from ..stable_diffusion import StableDiffusionPipelineOutput from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker @@ -170,7 +170,7 @@ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, a return hidden_states -class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, TextualInversionLoaderMixin): +class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin): r""" Pipeline for text-to-image generation using Stable Diffusion and Attend-and-Excite. @@ -246,22 +246,6 @@ def __init__( self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.register_to_config(requires_safety_checker=requires_safety_checker) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. 
- """ - self.vae.disable_slicing() - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, diff --git a/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py b/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py index a6724e44334f..4c90ce0646c4 100644 --- a/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +++ b/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py @@ -39,7 +39,7 @@ unscale_lora_layers, ) from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from ..stable_diffusion import StableDiffusionPipelineOutput from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker @@ -235,7 +235,9 @@ def preprocess_mask(mask, batch_size: int = 1): return mask -class StableDiffusionDiffEditPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): +class StableDiffusionDiffEditPipeline( + DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin +): r""" @@ -371,39 +373,6 @@ def __init__( self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.register_to_config(requires_safety_checker=requires_safety_checker) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. 
- """ - self.vae.disable_tiling() - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, diff --git a/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py b/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py index 138e002bf0eb..9f0d1190fd87 100644 --- a/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +++ b/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py @@ -35,7 +35,7 @@ unscale_lora_layers, ) from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from ..stable_diffusion import StableDiffusionPipelineOutput from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker @@ -99,7 +99,7 @@ """ -class StableDiffusionGLIGENPipeline(DiffusionPipeline): +class StableDiffusionGLIGENPipeline(DiffusionPipeline, StableDiffusionMixin): r""" Pipeline for text-to-image generation using Stable Diffusion with Grounded-Language-to-Image Generation (GLIGEN). @@ -172,35 +172,6 @@ def __init__( self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) self.register_to_config(requires_safety_checker=requires_safety_checker) - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. 
- """ - self.vae.disable_tiling() - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, diff --git a/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py b/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py index 6bd67a06cbbd..bbffaf2884a3 100644 --- a/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +++ b/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py @@ -34,7 +34,7 @@ from ...schedulers import KarrasDiffusionSchedulers from ...utils import USE_PEFT_BACKEND, logging, replace_example_docstring, scale_lora_layers, unscale_lora_layers from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from ..stable_diffusion import StableDiffusionPipelineOutput from ..stable_diffusion.clip_image_project_model import CLIPImageProjection from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker @@ -145,7 +145,7 @@ """ -class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline): +class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline, StableDiffusionMixin): r""" Pipeline for text-to-image generation using Stable Diffusion with Grounded-Language-to-Image Generation (GLIGEN). @@ -230,35 +230,6 @@ def __init__( self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) self.register_to_config(requires_safety_checker=requires_safety_checker) - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. 
- """ - self.vae.disable_tiling() - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt def encode_prompt( self, diff --git a/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py b/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py index 602deeef194f..bc565c938a30 100755 --- a/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py @@ -26,7 +26,7 @@ from ...schedulers import LMSDiscreteScheduler from ...utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from ..stable_diffusion import StableDiffusionPipelineOutput @@ -47,7 +47,9 @@ def apply_model(self, *args, **kwargs): return self.model(*args, encoder_hidden_states=encoder_hidden_states, **kwargs).sample -class StableDiffusionKDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): +class StableDiffusionKDiffusionPipeline( + DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin +): r""" Pipeline for text-to-image generation using Stable Diffusion. diff --git a/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py b/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py index 8b83c9aec43a..ed46a1e36b60 100644 --- a/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py @@ -50,7 +50,7 @@ unscale_lora_layers, ) from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput @@ -91,6 +91,7 @@ def apply_model(self, *args, **kwargs): class StableDiffusionXLKDiffusionPipeline( DiffusionPipeline, + StableDiffusionMixin, FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin, @@ -196,39 +197,6 @@ def set_scheduler(self, scheduler_type: str): raise ValueError(f"Invalid scheduler type {scheduler_type}. Please choose one of {valid_samplers}.") - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. 
When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_tiling() - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt def encode_prompt( self, @@ -582,94 +550,6 @@ def upcast_vae(self): self.vae.decoder.conv_in.to(dtype) self.vae.decoder.mid_block.to(dtype) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. - """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.fuse_qkv_projections - def fuse_qkv_projections(self, unet: bool = True, vae: bool = True): - """ - Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, - key, value) are fused. For cross-attention modules, key and value projection matrices are fused. - - - - This API is 🧪 experimental. - - - - Args: - unet (`bool`, defaults to `True`): To apply fusion on the UNet. - vae (`bool`, defaults to `True`): To apply fusion on the VAE. - """ - self.fusing_unet = False - self.fusing_vae = False - - if unet: - self.fusing_unet = True - self.unet.fuse_qkv_projections() - self.unet.set_attn_processor(FusedAttnProcessor2_0()) - - if vae: - if not isinstance(self.vae, AutoencoderKL): - raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.") - - self.fusing_vae = True - self.vae.fuse_qkv_projections() - self.vae.set_attn_processor(FusedAttnProcessor2_0()) - - def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True): - """Disable QKV projection fusion if enabled. 
- - - - This API is 🧪 experimental. - - - - Args: - unet (`bool`, defaults to `True`): To apply fusion on the UNet. - vae (`bool`, defaults to `True`): To apply fusion on the VAE. - - """ - if unet: - if not self.fusing_unet: - logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.") - else: - self.unet.unfuse_qkv_projections() - self.fusing_unet = False - - if vae: - if not self.fusing_vae: - logger.warning("The VAE was not initially fused for QKV projections. Doing nothing.") - else: - self.vae.unfuse_qkv_projections() - self.fusing_vae = False - @property def guidance_scale(self): return self._guidance_scale diff --git a/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py b/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py index 6553e9786488..ddbf9ebbb1d0 100644 --- a/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +++ b/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py @@ -36,7 +36,7 @@ unscale_lora_layers, ) from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker @@ -82,7 +82,12 @@ class LDM3DPipelineOutput(BaseOutput): class StableDiffusionLDM3DPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin, FromSingleFileMixin + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + IPAdapterMixin, + LoraLoaderMixin, + FromSingleFileMixin, ): r""" Pipeline for text-to-image and 3D generation using LDM3D. @@ -165,39 +170,6 @@ def __init__( self.image_processor = VaeImageProcessorLDM3D(vae_scale_factor=self.vae_scale_factor) self.register_to_config(requires_safety_checker=requires_safety_checker) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. 
- """ - self.vae.disable_tiling() - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, diff --git a/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py b/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py index 3773ea6e9728..1b371b4746ad 100644 --- a/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +++ b/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py @@ -32,7 +32,7 @@ unscale_lora_layers, ) from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from ..stable_diffusion import StableDiffusionPipelineOutput from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker @@ -59,7 +59,9 @@ """ -class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin): +class StableDiffusionPanoramaPipeline( + DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin +): r""" Pipeline for text-to-image generation using MultiDiffusion. @@ -140,22 +142,6 @@ def __init__( self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.register_to_config(requires_safety_checker=requires_safety_checker) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, diff --git a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py index d72698cdc6a3..24c648a813ba 100644 --- a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +++ b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py @@ -14,7 +14,7 @@ from ...schedulers import KarrasDiffusionSchedulers from ...utils import deprecate, logging from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from . import StableDiffusionSafePipelineOutput from .safety_checker import SafeStableDiffusionSafetyChecker @@ -22,7 +22,7 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name -class StableDiffusionPipelineSafe(DiffusionPipeline, IPAdapterMixin): +class StableDiffusionPipelineSafe(DiffusionPipeline, StableDiffusionMixin, IPAdapterMixin): r""" Pipeline based on the [`StableDiffusionPipeline`] for text-to-image generation using Safe Latent Diffusion. 
diff --git a/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py b/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py index 435bbca4d7d2..878a3fdac211 100644 --- a/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +++ b/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py @@ -33,7 +33,7 @@ unscale_lora_layers, ) from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from ..stable_diffusion import StableDiffusionPipelineOutput from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker @@ -98,7 +98,7 @@ def __call__( # Modified to get self-attention guidance scale in this paper (https://arxiv.org/pdf/2210.00939.pdf) as an input -class StableDiffusionSAGPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin): +class StableDiffusionSAGPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, IPAdapterMixin): r""" Pipeline for text-to-image generation using Stable Diffusion. @@ -161,22 +161,6 @@ def __init__( self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.register_to_config(requires_safety_checker=requires_safety_checker) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index 14376cc2d9ca..98028f8be50e 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -52,7 +52,7 @@ unscale_lora_layers, ) from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from .pipeline_output import StableDiffusionXLPipelineOutput @@ -148,6 +148,7 @@ def retrieve_timesteps( class StableDiffusionXLPipeline( DiffusionPipeline, + StableDiffusionMixin, FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin, @@ -257,39 +258,6 @@ def __init__( else: self.watermark = None - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. 
- """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_tiling() - def encode_prompt( self, prompt: str, @@ -744,93 +712,6 @@ def upcast_vae(self): self.vae.decoder.conv_in.to(dtype) self.vae.decoder.mid_block.to(dtype) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. - """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - - def fuse_qkv_projections(self, unet: bool = True, vae: bool = True): - """ - Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, - key, value) are fused. For cross-attention modules, key and value projection matrices are fused. - - - - This API is 🧪 experimental. - - - - Args: - unet (`bool`, defaults to `True`): To apply fusion on the UNet. - vae (`bool`, defaults to `True`): To apply fusion on the VAE. 
- """ - self.fusing_unet = False - self.fusing_vae = False - - if unet: - self.fusing_unet = True - self.unet.fuse_qkv_projections() - self.unet.set_attn_processor(FusedAttnProcessor2_0()) - - if vae: - if not isinstance(self.vae, AutoencoderKL): - raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.") - - self.fusing_vae = True - self.vae.fuse_qkv_projections() - self.vae.set_attn_processor(FusedAttnProcessor2_0()) - - def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True): - """Disable QKV projection fusion if enabled. - - - - This API is 🧪 experimental. - - - - Args: - unet (`bool`, defaults to `True`): To apply fusion on the UNet. - vae (`bool`, defaults to `True`): To apply fusion on the VAE. - - """ - if unet: - if not self.fusing_unet: - logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.") - else: - self.unet.unfuse_qkv_projections() - self.fusing_unet = False - - if vae: - if not self.fusing_vae: - logger.warning("The VAE was not initially fused for QKV projections. Doing nothing.") - else: - self.vae.unfuse_qkv_projections() - self.fusing_vae = False - # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): """ diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index 2f9e6eb28f21..58c6f3ff6ebf 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -35,7 +35,6 @@ from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel from ...models.attention_processor import ( AttnProcessor2_0, - FusedAttnProcessor2_0, LoRAAttnProcessor2_0, LoRAXFormersAttnProcessor, XFormersAttnProcessor, @@ -53,7 +52,7 @@ unscale_lora_layers, ) from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from .pipeline_output import StableDiffusionXLPipelineOutput @@ -166,6 +165,7 @@ def retrieve_timesteps( class StableDiffusionXLImg2ImgPipeline( DiffusionPipeline, + StableDiffusionMixin, TextualInversionLoaderMixin, FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, @@ -278,39 +278,6 @@ def __init__( else: self.watermark = None - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. 
When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_tiling() - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt def encode_prompt( self, @@ -879,95 +846,6 @@ def upcast_vae(self): self.vae.decoder.conv_in.to(dtype) self.vae.decoder.mid_block.to(dtype) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. - """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.fuse_qkv_projections - def fuse_qkv_projections(self, unet: bool = True, vae: bool = True): - """ - Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, - key, value) are fused. For cross-attention modules, key and value projection matrices are fused. - - - - This API is 🧪 experimental. - - - - Args: - unet (`bool`, defaults to `True`): To apply fusion on the UNet. - vae (`bool`, defaults to `True`): To apply fusion on the VAE. 
- """ - self.fusing_unet = False - self.fusing_vae = False - - if unet: - self.fusing_unet = True - self.unet.fuse_qkv_projections() - self.unet.set_attn_processor(FusedAttnProcessor2_0()) - - if vae: - if not isinstance(self.vae, AutoencoderKL): - raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.") - - self.fusing_vae = True - self.vae.fuse_qkv_projections() - self.vae.set_attn_processor(FusedAttnProcessor2_0()) - - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.unfuse_qkv_projections - def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True): - """Disable QKV projection fusion if enabled. - - - - This API is 🧪 experimental. - - - - Args: - unet (`bool`, defaults to `True`): To apply fusion on the UNet. - vae (`bool`, defaults to `True`): To apply fusion on the VAE. - - """ - if unet: - if not self.fusing_unet: - logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.") - else: - self.unet.unfuse_qkv_projections() - self.fusing_unet = False - - if vae: - if not self.fusing_vae: - logger.warning("The VAE was not initially fused for QKV projections. Doing nothing.") - else: - self.vae.unfuse_qkv_projections() - self.fusing_vae = False - # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): """ diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py index 3a0c494accd7..7c966c73acf8 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py @@ -36,7 +36,6 @@ from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel from ...models.attention_processor import ( AttnProcessor2_0, - FusedAttnProcessor2_0, LoRAAttnProcessor2_0, LoRAXFormersAttnProcessor, XFormersAttnProcessor, @@ -54,7 +53,7 @@ unscale_lora_layers, ) from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from .pipeline_output import StableDiffusionXLPipelineOutput @@ -311,6 +310,7 @@ def retrieve_timesteps( class StableDiffusionXLInpaintPipeline( DiffusionPipeline, + StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionXLLoraLoaderMixin, FromSingleFileMixin, @@ -429,39 +429,6 @@ def __init__( else: self.watermark = None - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. 
- """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_tiling() - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): dtype = next(self.image_encoder.parameters()).dtype @@ -1115,95 +1082,6 @@ def upcast_vae(self): self.vae.decoder.conv_in.to(dtype) self.vae.decoder.mid_block.to(dtype) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. - """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.fuse_qkv_projections - def fuse_qkv_projections(self, unet: bool = True, vae: bool = True): - """ - Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, - key, value) are fused. For cross-attention modules, key and value projection matrices are fused. - - - - This API is 🧪 experimental. - - - - Args: - unet (`bool`, defaults to `True`): To apply fusion on the UNet. - vae (`bool`, defaults to `True`): To apply fusion on the VAE. 
- """ - self.fusing_unet = False - self.fusing_vae = False - - if unet: - self.fusing_unet = True - self.unet.fuse_qkv_projections() - self.unet.set_attn_processor(FusedAttnProcessor2_0()) - - if vae: - if not isinstance(self.vae, AutoencoderKL): - raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.") - - self.fusing_vae = True - self.vae.fuse_qkv_projections() - self.vae.set_attn_processor(FusedAttnProcessor2_0()) - - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.unfuse_qkv_projections - def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True): - """Disable QKV projection fusion if enabled. - - - - This API is 🧪 experimental. - - - - Args: - unet (`bool`, defaults to `True`): To apply fusion on the UNet. - vae (`bool`, defaults to `True`): To apply fusion on the VAE. - - """ - if unet: - if not self.fusing_unet: - logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.") - else: - self.unet.unfuse_qkv_projections() - self.fusing_unet = False - - if vae: - if not self.fusing_vae: - logger.warning("The VAE was not initially fused for QKV projections. Doing nothing.") - else: - self.vae.unfuse_qkv_projections() - self.fusing_vae = False - # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): """ diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py index 2e4225cf6145..b3327996263a 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py @@ -41,7 +41,7 @@ scale_lora_layers, ) from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from .pipeline_output import StableDiffusionXLPipelineOutput @@ -118,7 +118,11 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): class StableDiffusionXLInstructPix2PixPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + FromSingleFileMixin, + StableDiffusionXLLoraLoaderMixin, ): r""" Pipeline for pixel-level image editing by following text instructions. Based on Stable Diffusion XL. @@ -205,38 +209,6 @@ def __init__( else: self.watermark = None - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. - - When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several - steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. - - When this option is enabled, the VAE will split the input tensor into tiles to compute decoding and encoding in - several steps. 
This is useful to save a large amount of memory and to allow the processing of larger images. - """ - self.vae.enable_tiling() - - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously invoked, this method will go back to - computing decoding in one step. - """ - self.vae.disable_tiling() - def encode_prompt( self, prompt: str, @@ -621,34 +593,6 @@ def upcast_vae(self): self.vae.decoder.conv_in.to(dtype) self.vae.decoder.mid_block.to(dtype) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. - """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - @torch.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( diff --git a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py index f5d3b66f326b..0b55bb38b5eb 100644 --- a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py @@ -37,7 +37,7 @@ unscale_lora_layers, ) from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker @@ -163,7 +163,7 @@ def retrieve_timesteps( return timesteps, num_inference_steps -class StableDiffusionAdapterPipeline(DiffusionPipeline): +class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin): r""" Pipeline for text-to-image generation using Stable Diffusion augmented with T2I-Adapter https://arxiv.org/abs/2302.08453 @@ -248,22 +248,6 @@ def __init__( self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.register_to_config(requires_safety_checker=requires_safety_checker) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. 
When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, @@ -628,34 +612,6 @@ def _default_height_width(self, height, width, image): return height, width - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. 
- """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): """ diff --git a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py index 0c812179dac1..96c7c6857c05 100644 --- a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py @@ -51,7 +51,7 @@ unscale_lora_layers, ) from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput @@ -181,6 +181,7 @@ def retrieve_timesteps( class StableDiffusionXLAdapterPipeline( DiffusionPipeline, + StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionXLLoraLoaderMixin, IPAdapterMixin, @@ -270,39 +271,6 @@ def __init__( self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.default_sample_size = self.unet.config.sample_size - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. 
- """ - self.vae.disable_tiling() - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt def encode_prompt( self, @@ -788,34 +756,6 @@ def _default_height_width(self, height, width, image): return height, width - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. - """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): """ diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py index eb34910b7008..0ed0765703f2 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py @@ -33,7 +33,7 @@ unscale_lora_layers, ) from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from . import TextToVideoSDPipelineOutput @@ -81,7 +81,7 @@ def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type: return outputs -class TextToVideoSDPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): +class TextToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin): r""" Pipeline for text-to-video generation. @@ -129,39 +129,6 @@ def __init__( self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. 
This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_tiling() - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, @@ -484,34 +451,6 @@ def prepare_latents( latents = latents * self.scheduler.init_noise_sigma return latents - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. 
- """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - @torch.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py index 2a41d9a8f735..40c486316e13 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py @@ -34,7 +34,7 @@ unscale_lora_layers, ) from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from . import TextToVideoSDPipelineOutput @@ -157,7 +157,7 @@ def preprocess_video(video): return video -class VideoToVideoSDPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): +class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin): r""" Pipeline for text-guided video-to-video generation. @@ -205,39 +205,6 @@ def __init__( self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. 
- """ - self.vae.disable_tiling() - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, @@ -591,34 +558,6 @@ def prepare_latents(self, video, timestep, batch_size, dtype, device, generator= return latents - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. - """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - @torch.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py index fc34d50a50dd..408ae23f4d9f 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py @@ -17,7 +17,7 @@ from ...schedulers import KarrasDiffusionSchedulers from ...utils import USE_PEFT_BACKEND, BaseOutput, logging, scale_lora_layers, unscale_lora_layers from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from ..stable_diffusion import StableDiffusionSafetyChecker @@ -281,7 +281,7 @@ def create_motion_field_and_warp_latents(motion_field_strength_x, motion_field_s return warped_latents -class TextToVideoZeroPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): +class TextToVideoZeroPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin): r""" Pipeline for zero-shot text-to-video generation using Stable Diffusion. 
diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py index 4fe2279a468a..eaa2760363a9 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py @@ -37,7 +37,7 @@ unscale_lora_layers, ) from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin if is_invisible_watermark_available(): @@ -327,6 +327,7 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): class TextToVideoZeroSDXLPipeline( DiffusionPipeline, + StableDiffusionMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin, ): @@ -436,22 +437,6 @@ def prepare_extra_step_kwargs(self, generator, eta): extra_step_kwargs["generator"] = generator return extra_step_kwargs - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.upcast_vae def upcast_vae(self): dtype = self.vae.dtype diff --git a/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py b/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py index 38c12edb2d43..5d61b1054e1c 100644 --- a/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +++ b/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py @@ -135,39 +135,6 @@ def __init__( # TODO: handle safety checking? self.safety_checker = None - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. 
- """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_tiling() - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs def prepare_extra_step_kwargs(self, generator, eta): # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature @@ -244,6 +211,39 @@ def _infer_mode(self, prompt, prompt_embeds, image, latents, prompt_latents, vae return mode + # Copied from diffusers.pipelines.pipeline_utils.StableDiffusionMixin.enable_vae_slicing + def enable_vae_slicing(self): + r""" + Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to + compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. + """ + self.vae.enable_slicing() + + # Copied from diffusers.pipelines.pipeline_utils.StableDiffusionMixin.disable_vae_slicing + def disable_vae_slicing(self): + r""" + Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to + computing decoding in one step. + """ + self.vae.disable_slicing() + + # Copied from diffusers.pipelines.pipeline_utils.StableDiffusionMixin.enable_vae_tiling + def enable_vae_tiling(self): + r""" + Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to + compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow + processing larger images. + """ + self.vae.enable_tiling() + + # Copied from diffusers.pipelines.pipeline_utils.StableDiffusionMixin.disable_vae_tiling + def disable_vae_tiling(self): + r""" + Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to + computing decoding in one step. 
+ """ + self.vae.disable_tiling() + # Functions to manually set the mode def set_text_mode(self): r"""Manually set the generation mode to unconditional ("marginal") text generation.""" diff --git a/src/diffusers/utils/dummy_pt_objects.py b/src/diffusers/utils/dummy_pt_objects.py index 7372dc9d43b9..d4e13c3c3eb4 100644 --- a/src/diffusers/utils/dummy_pt_objects.py +++ b/src/diffusers/utils/dummy_pt_objects.py @@ -675,6 +675,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch"]) +class StableDiffusionMixin(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + class AmusedScheduler(metaclass=DummyObject): _backends = ["torch"] diff --git a/tests/pipelines/animatediff/test_animatediff.py b/tests/pipelines/animatediff/test_animatediff.py index 3b789e4ff0f3..288f856dc677 100644 --- a/tests/pipelines/animatediff/test_animatediff.py +++ b/tests/pipelines/animatediff/test_animatediff.py @@ -18,7 +18,7 @@ from diffusers.utils.testing_utils import numpy_cosine_similarity_distance, require_torch_gpu, slow, torch_device from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ..test_pipelines_common import IPAdapterTesterMixin, PipelineTesterMixin +from ..test_pipelines_common import IPAdapterTesterMixin, PipelineTesterMixin, SDFunctionTesterMixin def to_np(tensor): @@ -28,7 +28,9 @@ def to_np(tensor): return tensor -class AnimateDiffPipelineFastTests(IPAdapterTesterMixin, PipelineTesterMixin, unittest.TestCase): +class AnimateDiffPipelineFastTests( + IPAdapterTesterMixin, SDFunctionTesterMixin, PipelineTesterMixin, unittest.TestCase +): pipeline_class = AnimateDiffPipeline params = TEXT_TO_IMAGE_PARAMS batch_params = TEXT_TO_IMAGE_BATCH_PARAMS diff --git a/tests/pipelines/i2vgen_xl/test_i2vgenxl.py b/tests/pipelines/i2vgen_xl/test_i2vgenxl.py index 004b06f160bd..aeda67174ad5 100644 --- a/tests/pipelines/i2vgen_xl/test_i2vgenxl.py +++ b/tests/pipelines/i2vgen_xl/test_i2vgenxl.py @@ -46,14 +46,14 @@ torch_device, ) -from ..test_pipelines_common import PipelineTesterMixin +from ..test_pipelines_common import PipelineTesterMixin, SDFunctionTesterMixin enable_full_determinism() @skip_mps -class I2VGenXLPipelineFastTests(PipelineTesterMixin, unittest.TestCase): +class I2VGenXLPipelineFastTests(SDFunctionTesterMixin, PipelineTesterMixin, unittest.TestCase): pipeline_class = I2VGenXLPipeline params = frozenset(["prompt", "negative_prompt", "image"]) batch_params = frozenset(["prompt", "negative_prompt", "image", "generator"]) diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py index dcc4dadf992b..7aef098916ca 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py @@ -52,14 +52,23 @@ TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS, ) -from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin +from ..test_pipelines_common import ( + PipelineKarrasSchedulerTesterMixin, + PipelineLatentTesterMixin, + PipelineTesterMixin, + SDFunctionTesterMixin, +) enable_full_determinism() class StableDiffusion2PipelineFastTests( - PipelineLatentTesterMixin, 
PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase + SDFunctionTesterMixin, + PipelineLatentTesterMixin, + PipelineKarrasSchedulerTesterMixin, + PipelineTesterMixin, + unittest.TestCase, ): pipeline_class = StableDiffusionPipeline params = TEXT_TO_IMAGE_PARAMS diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py index 3c81328deac8..1b83e23513ab 100644 --- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py +++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py @@ -53,6 +53,7 @@ IPAdapterTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin, + SDFunctionTesterMixin, SDXLOptionalComponentsTesterMixin, ) @@ -61,6 +62,7 @@ class StableDiffusionXLPipelineFastTests( + SDFunctionTesterMixin, IPAdapterTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin, @@ -948,37 +950,6 @@ def test_stable_diffusion_xl_save_from_pretrained(self): assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3 - def test_stable_diffusion_xl_with_fused_qkv_projections(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionXLPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - original_image_slice = image[0, -3:, -3:, -1] - - sd_pipe.fuse_qkv_projections() - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice_fused = image[0, -3:, -3:, -1] - - sd_pipe.unfuse_qkv_projections() - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice_disabled = image[0, -3:, -3:, -1] - - assert np.allclose( - original_image_slice, image_slice_fused, atol=1e-2, rtol=1e-2 - ), "Fusion of QKV projections shouldn't affect the outputs." - assert np.allclose( - image_slice_fused, image_slice_disabled, atol=1e-2, rtol=1e-2 - ), "Outputs, with QKV projection fusion enabled, shouldn't change when fused QKV projections are disabled." - assert np.allclose( - original_image_slice, image_slice_disabled, atol=1e-2, rtol=1e-2 - ), "Original outputs should match when fused QKV projections are disabled." - def test_pipeline_interrupt(self): components = self.get_dummy_components() sd_pipe = StableDiffusionXLPipeline(**components) diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index 3c439d9c7042..0ca464d3bd13 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -30,6 +30,10 @@ ) from diffusers.image_processor import VaeImageProcessor from diffusers.loaders import IPAdapterMixin +from diffusers.models.unets.unet_3d_condition import UNet3DConditionModel +from diffusers.models.unets.unet_i2vgen_xl import I2VGenXLUNet +from diffusers.models.unets.unet_motion_model import UNetMotionModel +from diffusers.pipelines.pipeline_utils import StableDiffusionMixin from diffusers.schedulers import KarrasDiffusionSchedulers from diffusers.utils import logging from diffusers.utils.import_utils import is_accelerate_available, is_accelerate_version, is_xformers_available @@ -61,6 +65,149 @@ def check_same_shape(tensor_list): return all(shape == shapes[0] for shape in shapes[1:]) +class SDFunctionTesterMixin: + """ + This mixin is designed to be used with PipelineTesterMixin and unittest.TestCase classes. 
+    It provides a set of common tests for PyTorch pipelines that inherit from StableDiffusionMixin, e.g. vae_slicing, vae_tiling, freeu, etc.
+    """
+
+    def test_vae_slicing(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        # components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config)
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        image_count = 4
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["prompt"] = [inputs["prompt"]] * image_count
+        if "image" in inputs:  # fix batch size mismatch in the I2VGenXL pipeline
+            inputs["image"] = [inputs["image"]] * image_count
+        output_1 = pipe(**inputs)
+
+        # make sure sliced vae decode yields the same result
+        pipe.enable_vae_slicing()
+        inputs = self.get_dummy_inputs(device)
+        inputs["prompt"] = [inputs["prompt"]] * image_count
+        if "image" in inputs:
+            inputs["image"] = [inputs["image"]] * image_count
+        inputs["return_dict"] = False
+        output_2 = pipe(**inputs)
+
+        assert np.abs(output_2[0].flatten() - output_1[0].flatten()).max() < 1e-2
+
+    def test_vae_tiling(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+
+        # drop the safety checker; it is not needed for this test
+        if "safety_checker" in components:
+            components["safety_checker"] = None
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["return_dict"] = False
+
+        # Test that tiled decode at 512x512 yields the same result as the non-tiled decode
+        output_1 = pipe(**inputs)[0]
+
+        # make sure tiled vae decode yields the same result
+        pipe.enable_vae_tiling()
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["return_dict"] = False
+        output_2 = pipe(**inputs)[0]
+
+        assert np.abs(output_2 - output_1).max() < 5e-1
+
+        # test that tiled decode works with various shapes
+        shapes = [(1, 4, 73, 97), (1, 4, 97, 73), (1, 4, 49, 65), (1, 4, 65, 49)]
+        for shape in shapes:
+            zeros = torch.zeros(shape).to(device)
+            pipe.vae.decode(zeros)
+
+    def test_freeu_enabled(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["return_dict"] = False
+        output = pipe(**inputs)[0]
+
+        pipe.enable_freeu(s1=0.9, s2=0.2, b1=1.2, b2=1.4)
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["return_dict"] = False
+        output_freeu = pipe(**inputs)[0]
+
+        assert not np.allclose(
+            output[0, -3:, -3:, -1], output_freeu[0, -3:, -3:, -1]
+        ), "Enabling of FreeU should lead to different results."
+
+    def test_freeu_disabled(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        inputs["return_dict"] = False
+        output = pipe(**inputs)[0]
+
+        pipe.enable_freeu(s1=0.9, s2=0.2, b1=1.2, b2=1.4)
+        pipe.disable_freeu()
+
+        freeu_keys = {"s1", "s2", "b1", "b2"}
+        for upsample_block in pipe.unet.up_blocks:
+            for key in freeu_keys:
+                assert getattr(upsample_block, key) is None, f"Disabling of FreeU should have set {key} to None."
+ + inputs = self.get_dummy_inputs(torch_device) + inputs["return_dict"] = False + output_no_freeu = pipe(**inputs)[0] + assert np.allclose( + output, output_no_freeu, atol=1e-2 + ), f"Disabling of FreeU should lead to results similar to the default pipeline results but Max Abs Error={np.abs(output_no_freeu - output).max()}." + + def test_fused_qkv_projections(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(device) + pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + inputs["return_dict"] = False + image = pipe(**inputs)[0] + original_image_slice = image[0, -3:, -3:, -1] + + pipe.fuse_qkv_projections() + inputs = self.get_dummy_inputs(device) + inputs["return_dict"] = False + image_fused = pipe(**inputs)[0] + image_slice_fused = image_fused[0, -3:, -3:, -1] + + pipe.unfuse_qkv_projections() + inputs = self.get_dummy_inputs(device) + inputs["return_dict"] = False + image_disabled = pipe(**inputs)[0] + image_slice_disabled = image_disabled[0, -3:, -3:, -1] + + assert np.allclose( + original_image_slice, image_slice_fused, atol=1e-2, rtol=1e-2 + ), "Fusion of QKV projections shouldn't affect the outputs." + assert np.allclose( + image_slice_fused, image_slice_disabled, atol=1e-2, rtol=1e-2 + ), "Outputs, with QKV projection fusion enabled, shouldn't change when fused QKV projections are disabled." + assert np.allclose( + original_image_slice, image_slice_disabled, atol=1e-2, rtol=1e-2 + ), "Original outputs should match when fused QKV projections are disabled." + + class IPAdapterTesterMixin: """ This mixin is designed to be used with PipelineTesterMixin and unittest.TestCase classes. 
@@ -1137,6 +1284,18 @@ def callback_increase_guidance(pipe, i, t, callback_kwargs):
         # accounts for models that modify the number of inference steps based on strength
         assert pipe.guidance_scale == (inputs["guidance_scale"] + pipe.num_timesteps)
 
+    def test_StableDiffusionMixin_component(self):
+        """Any pipeline that inherits from StableDiffusionMixin should have `vae` and `unet` components."""
+        if not issubclass(self.pipeline_class, StableDiffusionMixin):
+            return
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        self.assertTrue(hasattr(pipe, "vae") and isinstance(pipe.vae, (AutoencoderKL, AutoencoderTiny)))
+        self.assertTrue(
+            hasattr(pipe, "unet")
+            and isinstance(pipe.unet, (UNet2DConditionModel, UNet3DConditionModel, I2VGenXLUNet, UNetMotionModel))
+        )
+
 
 @is_staging_test
 class PipelinePushToHubTester(unittest.TestCase):
diff --git a/tests/pipelines/text_to_video_synthesis/test_text_to_video.py b/tests/pipelines/text_to_video_synthesis/test_text_to_video.py
index d988350505a8..9dc48011d2f1 100644
--- a/tests/pipelines/text_to_video_synthesis/test_text_to_video.py
+++ b/tests/pipelines/text_to_video_synthesis/test_text_to_video.py
@@ -37,14 +37,14 @@
 )
 
 from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
-from ..test_pipelines_common import PipelineTesterMixin
+from ..test_pipelines_common import PipelineTesterMixin, SDFunctionTesterMixin
 
 
 enable_full_determinism()
 
 
 @skip_mps
-class TextToVideoSDPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class TextToVideoSDPipelineFastTests(PipelineTesterMixin, SDFunctionTesterMixin, unittest.TestCase):
     pipeline_class = TextToVideoSDPipeline
     params = TEXT_TO_IMAGE_PARAMS
     batch_params = TEXT_TO_IMAGE_BATCH_PARAMS

From abd922bd0c43a504e47eca2ed354c3634bd00834 Mon Sep 17 00:00:00 2001
From: Aryan
Date: Thu, 29 Feb 2024 02:09:01 +0530
Subject: [PATCH 05/28] [docs] unet type hints (#7134)

update
---
 .../models/unets/unet_2d_condition.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/diffusers/models/unets/unet_2d_condition.py b/src/diffusers/models/unets/unet_2d_condition.py
index fee7a34fb216..d067767315f1 100644
--- a/src/diffusers/models/unets/unet_2d_condition.py
+++ b/src/diffusers/models/unets/unet_2d_condition.py
@@ -204,7 +204,7 @@ def __init__(
         upcast_attention: bool = False,
         resnet_time_scale_shift: str = "default",
         resnet_skip_time_act: bool = False,
-        resnet_out_scale_factor: int = 1.0,
+        resnet_out_scale_factor: float = 1.0,
         time_embedding_type: str = "positional",
         time_embedding_dim: Optional[int] = None,
         time_embedding_act_fn: Optional[str] = None,
@@ -217,7 +217,7 @@
         class_embeddings_concat: bool = False,
         mid_block_only_cross_attention: Optional[bool] = None,
         cross_attention_norm: Optional[str] = None,
-        addition_embed_type_num_heads=64,
+        addition_embed_type_num_heads: int = 64,
     ):
         super().__init__()
@@ -485,9 +485,9 @@ def _check_config(
         up_block_types: Tuple[str],
         only_cross_attention: Union[bool, Tuple[bool]],
         block_out_channels: Tuple[int],
-        layers_per_block: [int, Tuple[int]],
+        layers_per_block: Union[int, Tuple[int]],
         cross_attention_dim: Union[int, Tuple[int]],
-        transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]],
+        transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple[int]]],
         reverse_transformer_layers_per_block: bool,
         attention_head_dim: int,
         num_attention_heads: Optional[Union[int, Tuple[int]]],
@@ -762,7 +762,7 @@ def set_default_attn_processor(self):
 
         self.set_attn_processor(processor)
 
-
def set_attention_slice(self, slice_size): + def set_attention_slice(self, slice_size: Union[str, int, List[int]] = "auto"): r""" Enable sliced attention computation. @@ -831,7 +831,7 @@ def _set_gradient_checkpointing(self, module, value=False): if hasattr(module, "gradient_checkpointing"): module.gradient_checkpointing = value - def enable_freeu(self, s1, s2, b1, b2): + def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): r"""Enables the FreeU mechanism from https://arxiv.org/abs/2309.11497. The suffixes after the scaling factors represent the stage blocks where they are being applied. @@ -953,7 +953,7 @@ def get_class_embed(self, sample: torch.Tensor, class_labels: Optional[torch.Ten return class_emb def get_aug_embed( - self, emb: torch.Tensor, encoder_hidden_states: torch.Tensor, added_cond_kwargs: Dict + self, emb: torch.Tensor, encoder_hidden_states: torch.Tensor, added_cond_kwargs: Dict[str, Any] ) -> Optional[torch.Tensor]: aug_emb = None if self.config.addition_embed_type == "text": @@ -1004,7 +1004,9 @@ def get_aug_embed( aug_emb = self.add_embedding(image_embs, hint) return aug_emb - def process_encoder_hidden_states(self, encoder_hidden_states: torch.Tensor, added_cond_kwargs) -> torch.Tensor: + def process_encoder_hidden_states( + self, encoder_hidden_states: torch.Tensor, added_cond_kwargs: Dict[str, Any] + ) -> torch.Tensor: if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj": encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states) elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj": From 3daebe2b445b7920281a6371918c979156969d7d Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 29 Feb 2024 08:27:24 +0530 Subject: [PATCH 06/28] use uv for installing stuff in the workflows. (#7116) * use uv for installing stuff in the workflows. * fix: from source installation command when using uv. * fix uv venv issue * edit editable installation. * fix quality installation * checking * make editable. * more check * check * add: export step * venv handling. * checking. * fix: dependency workflows. * peft tests. * proper way to initialize env. 
* Empty-Commit * Empty-Commit --- .github/workflows/benchmark.yml | 5 ++- .github/workflows/nightly_tests.yml | 18 +++++--- .github/workflows/pr_dependency_test.yml | 8 ++-- .github/workflows/pr_flax_dependency_test.yml | 14 +++--- .github/workflows/pr_test_fetcher.yml | 13 ++++-- .github/workflows/pr_test_peft_backend.yml | 13 +++--- .github/workflows/pr_tests.yml | 17 ++++++-- .../workflows/pr_torch_dependency_test.yml | 10 +++-- .github/workflows/push_tests.yml | 43 ++++++++++++------- .github/workflows/push_tests_fast.yml | 10 ++++- .github/workflows/push_tests_mps.yml | 10 ++--- 11 files changed, 104 insertions(+), 57 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index a3f530d51b7b..b3ec74d8c15b 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -31,8 +31,9 @@ jobs: - name: Install dependencies run: | apt-get update && apt-get install libsndfile1-dev libgl1 -y - python -m pip install -e .[quality,test] - python -m pip install pandas peft + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" + python -m uv pip install -e [quality,test] + python -m uv pip install pandas peft - name: Environment run: | python utils/print_env.py diff --git a/.github/workflows/nightly_tests.yml b/.github/workflows/nightly_tests.yml index fb0ce92cb61c..2019ea154224 100644 --- a/.github/workflows/nightly_tests.yml +++ b/.github/workflows/nightly_tests.yml @@ -60,9 +60,10 @@ jobs: - name: Install dependencies run: | - python -m pip install -e .[quality,test] - python -m pip install -U git+https://github.com/huggingface/transformers - python -m pip install git+https://github.com/huggingface/accelerate + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" + python -m uv pip install -e [quality,test] + python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers + python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate - name: Environment run: | @@ -73,6 +74,7 @@ jobs: env: HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} run: | + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \ -s -v -k "not Flax and not Onnx" \ --make-reports=tests_${{ matrix.config.report }} \ @@ -83,6 +85,7 @@ jobs: env: HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} run: | + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m pytest -n 0 \ -s -v -k "Flax" \ --make-reports=tests_${{ matrix.config.report }} \ @@ -93,6 +96,7 @@ jobs: env: HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} run: | + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \ -s -v -k "Onnx" \ --make-reports=tests_${{ matrix.config.report }} \ @@ -132,10 +136,10 @@ jobs: - name: Install dependencies shell: arch -arch arm64 bash {0} run: | - ${CONDA_RUN} python -m pip install --upgrade pip - ${CONDA_RUN} python -m pip install -e .[quality,test] - ${CONDA_RUN} python -m pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu - ${CONDA_RUN} python -m pip install git+https://github.com/huggingface/accelerate + ${CONDA_RUN} python -m pip install --upgrade pip uv + ${CONDA_RUN} python -m uv pip install -e [quality,test] + ${CONDA_RUN} python -m uv pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu + ${CONDA_RUN} python -m uv pip install 
accelerate@git+https://github.com/huggingface/accelerate - name: Environment shell: arch -arch arm64 bash {0} diff --git a/.github/workflows/pr_dependency_test.yml b/.github/workflows/pr_dependency_test.yml index 0839497ed732..f21f09ef875e 100644 --- a/.github/workflows/pr_dependency_test.yml +++ b/.github/workflows/pr_dependency_test.yml @@ -25,10 +25,12 @@ jobs: python-version: "3.8" - name: Install dependencies run: | - python -m pip install --upgrade pip - pip install -e . - pip install pytest + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" + python -m pip install --upgrade pip uv + python -m uv pip install -e . + python -m uv pip install pytest - name: Check for soft dependencies run: | + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" pytest tests/others/test_dependencies.py \ No newline at end of file diff --git a/.github/workflows/pr_flax_dependency_test.yml b/.github/workflows/pr_flax_dependency_test.yml index e905d94c250f..bbad72929917 100644 --- a/.github/workflows/pr_flax_dependency_test.yml +++ b/.github/workflows/pr_flax_dependency_test.yml @@ -25,12 +25,14 @@ jobs: python-version: "3.8" - name: Install dependencies run: | - python -m pip install --upgrade pip - pip install -e . - pip install "jax[cpu]>=0.2.16,!=0.3.2" - pip install "flax>=0.4.1" - pip install "jaxlib>=0.1.65" - pip install pytest + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" + python -m pip install --upgrade pip uv + python -m uv pip install -e . + python -m uv pip install "jax[cpu]>=0.2.16,!=0.3.2" + python -m uv pip install "flax>=0.4.1" + python -m uv pip install "jaxlib>=0.1.65" + python -m uv pip install pytest - name: Check for soft dependencies run: | + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" pytest tests/others/test_dependencies.py diff --git a/.github/workflows/pr_test_fetcher.yml b/.github/workflows/pr_test_fetcher.yml index b82f8007aa74..6a7da2cb73d0 100644 --- a/.github/workflows/pr_test_fetcher.yml +++ b/.github/workflows/pr_test_fetcher.yml @@ -33,7 +33,8 @@ jobs: - name: Install dependencies run: | apt-get update && apt-get install libsndfile1-dev libgl1 -y - python -m pip install -e .[quality,test] + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" + python -m uv pip install -e [quality,test] - name: Environment run: | python utils/print_env.py @@ -89,15 +90,18 @@ jobs: - name: Install dependencies run: | apt-get update && apt-get install libsndfile1-dev libgl1 -y - python -m pip install -e .[quality,test] + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" + python -m pip install -e [quality,test] python -m pip install accelerate - name: Environment run: | + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python utils/print_env.py - name: Run all selected tests on CPU run: | + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.modules }}_tests_cpu ${{ fromJson(needs.setup_pr_tests.outputs.test_map)[matrix.modules] }} - name: Failure short reports @@ -144,15 +148,18 @@ jobs: - name: Install dependencies run: | apt-get update && apt-get install libsndfile1-dev libgl1 -y - python -m pip install -e .[quality,test] + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" + python -m pip install -e [quality,test] - name: Environment run: | + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python utils/print_env.py - name: Run Hub tests for models, schedulers, and pipelines on a staging env if: 
${{ matrix.config.framework == 'hub_tests_pytorch' }} run: | + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" HUGGINGFACE_CO_STAGING=true python -m pytest \ -m "is_staging_test" \ --make-reports=tests_${{ matrix.config.report }} \ diff --git a/.github/workflows/pr_test_peft_backend.yml b/.github/workflows/pr_test_peft_backend.yml index c93e8bef40cf..b9fb06d78124 100644 --- a/.github/workflows/pr_test_peft_backend.yml +++ b/.github/workflows/pr_test_peft_backend.yml @@ -84,21 +84,24 @@ jobs: - name: Install dependencies run: | apt-get update && apt-get install libsndfile1-dev libgl1 -y - python -m pip install -e .[quality,test] + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" + python -m uv pip install -e [quality,test] if [ "${{ matrix.lib-versions }}" == "main" ]; then - python -m pip install -U git+https://github.com/huggingface/peft.git - python -m pip install -U git+https://github.com/huggingface/transformers.git - python -m pip install -U git+https://github.com/huggingface/accelerate.git + python -m uv pip install -U peft@git+https://github.com/huggingface/peft.git + python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git + python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git else - python -m pip install -U peft transformers accelerate + python -m uv pip install -U peft transformers accelerate fi - name: Environment run: | + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python utils/print_env.py - name: Run fast PyTorch LoRA CPU tests with PEFT backend run: | + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \ -s -v \ --make-reports=tests_${{ matrix.config.report }} \ diff --git a/.github/workflows/pr_tests.yml b/.github/workflows/pr_tests.yml index d21232203479..d196bb7ff445 100644 --- a/.github/workflows/pr_tests.yml +++ b/.github/workflows/pr_tests.yml @@ -111,16 +111,19 @@ jobs: - name: Install dependencies run: | apt-get update && apt-get install libsndfile1-dev libgl1 -y - python -m pip install -e .[quality,test] - python -m pip install accelerate + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" + python -m uv pip install -e [quality,test] + python -m uv pip install accelerate - name: Environment run: | + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python utils/print_env.py - name: Run fast PyTorch Pipeline CPU tests if: ${{ matrix.config.framework == 'pytorch_pipelines' }} run: | + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \ -s -v -k "not Flax and not Onnx" \ --make-reports=tests_${{ matrix.config.report }} \ @@ -129,6 +132,7 @@ jobs: - name: Run fast PyTorch Model Scheduler CPU tests if: ${{ matrix.config.framework == 'pytorch_models' }} run: | + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \ -s -v -k "not Flax and not Onnx and not Dependency" \ --make-reports=tests_${{ matrix.config.report }} \ @@ -137,6 +141,7 @@ jobs: - name: Run fast Flax TPU tests if: ${{ matrix.config.framework == 'flax' }} run: | + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \ -s -v -k "Flax" \ --make-reports=tests_${{ matrix.config.report }} \ @@ -145,7 +150,8 @@ jobs: - name: Run example PyTorch CPU tests if: ${{ matrix.config.framework == 
'pytorch_examples' }} run: | - python -m pip install peft + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" + python -m uv pip install peft python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \ --make-reports=tests_${{ matrix.config.report }} \ examples @@ -194,15 +200,18 @@ jobs: - name: Install dependencies run: | apt-get update && apt-get install libsndfile1-dev libgl1 -y - python -m pip install -e .[quality,test] + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" + python -m uv pip install -e [quality,test] - name: Environment run: | + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python utils/print_env.py - name: Run Hub tests for models, schedulers, and pipelines on a staging env if: ${{ matrix.config.framework == 'hub_tests_pytorch' }} run: | + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" HUGGINGFACE_CO_STAGING=true python -m pytest \ -m "is_staging_test" \ --make-reports=tests_${{ matrix.config.report }} \ diff --git a/.github/workflows/pr_torch_dependency_test.yml b/.github/workflows/pr_torch_dependency_test.yml index c8aeb4b2a6fc..16a7724fe744 100644 --- a/.github/workflows/pr_torch_dependency_test.yml +++ b/.github/workflows/pr_torch_dependency_test.yml @@ -25,10 +25,12 @@ jobs: python-version: "3.8" - name: Install dependencies run: | - python -m pip install --upgrade pip - pip install -e . - pip install torch torchvision torchaudio - pip install pytest + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" + python -m pip install --upgrade pip uv + python -m uv pip install -e . + python -m uv pip install torch torchvision torchaudio + python -m uv pip install pytest - name: Check for soft dependencies run: | + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" pytest tests/others/test_dependencies.py diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml index 113b89e6abb7..4b6d5bfc3733 100644 --- a/.github/workflows/push_tests.yml +++ b/.github/workflows/push_tests.yml @@ -35,8 +35,9 @@ jobs: - name: Install dependencies run: | apt-get update && apt-get install libsndfile1-dev libgl1 -y - python -m pip install -e .[quality,test] - python -m pip install git+https://github.com/huggingface/accelerate.git + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" + python -m uv pip install -e [quality,test] + python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git - name: Environment run: | @@ -79,8 +80,9 @@ jobs: - name: Install dependencies run: | apt-get update && apt-get install libsndfile1-dev libgl1 -y - python -m pip install -e .[quality,test] - python -m pip install git+https://github.com/huggingface/accelerate.git + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" + python -m uv pip install -e [quality,test] + python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git - name: Environment run: | python utils/print_env.py @@ -128,8 +130,9 @@ jobs: - name: Install dependencies run: | apt-get update && apt-get install libsndfile1-dev libgl1 -y - python -m pip install -e .[quality,test] - python -m pip install git+https://github.com/huggingface/accelerate.git + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" + python -m uv pip install -e [quality,test] + python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git - name: Environment run: | @@ -177,9 +180,10 @@ jobs: - name: Install dependencies run: | apt-get update && apt-get install libsndfile1-dev 
libgl1 -y - python -m pip install -e .[quality,test] - python -m pip install git+https://github.com/huggingface/accelerate.git - python -m pip install git+https://github.com/huggingface/peft.git + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" + python -m uv pip install -e [quality,test] + python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git + python -m uv pip install peft@git+https://github.com/huggingface/peft.git - name: Environment run: | @@ -227,8 +231,9 @@ jobs: - name: Install dependencies run: | apt-get update && apt-get install libsndfile1-dev libgl1 -y - python -m pip install -e .[quality,test] - python -m pip install git+https://github.com/huggingface/accelerate.git + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" + python -m uv pip install -e [quality,test] + python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git - name: Environment run: | @@ -274,8 +279,9 @@ jobs: - name: Install dependencies run: | apt-get update && apt-get install libsndfile1-dev libgl1 -y - python -m pip install -e .[quality,test] - python -m pip install git+https://github.com/huggingface/accelerate.git + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" + python -m uv pip install -e [quality,test] + python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git - name: Environment run: | @@ -323,7 +329,8 @@ jobs: nvidia-smi - name: Install dependencies run: | - python -m pip install -e .[quality,test,training] + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" + python -m uv pip install -e [quality,test,training] - name: Environment run: | python utils/print_env.py @@ -363,7 +370,8 @@ jobs: nvidia-smi - name: Install dependencies run: | - python -m pip install -e .[quality,test,training] + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" + python -m uv pip install -e [quality,test,training] - name: Environment run: | python utils/print_env.py @@ -404,16 +412,19 @@ jobs: - name: Install dependencies run: | - python -m pip install -e .[quality,test,training] + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" + python -m uv pip install -e [quality,test,training] - name: Environment run: | + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python utils/print_env.py - name: Run example tests on GPU env: HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} run: | + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/ - name: Failure short reports diff --git a/.github/workflows/push_tests_fast.yml b/.github/workflows/push_tests_fast.yml index 664c9732290b..6b01577041b2 100644 --- a/.github/workflows/push_tests_fast.yml +++ b/.github/workflows/push_tests_fast.yml @@ -69,15 +69,18 @@ jobs: - name: Install dependencies run: | apt-get update && apt-get install libsndfile1-dev libgl1 -y - python -m pip install -e .[quality,test] + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" + python -m uv pip install -e [quality,test] - name: Environment run: | + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python utils/print_env.py - name: Run fast PyTorch CPU tests if: ${{ matrix.config.framework == 'pytorch' }} run: | + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \ -s -v -k "not Flax and not Onnx" \ 
--make-reports=tests_${{ matrix.config.report }} \ @@ -86,6 +89,7 @@ jobs: - name: Run fast Flax TPU tests if: ${{ matrix.config.framework == 'flax' }} run: | + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \ -s -v -k "Flax" \ --make-reports=tests_${{ matrix.config.report }} \ @@ -94,6 +98,7 @@ jobs: - name: Run fast ONNXRuntime CPU tests if: ${{ matrix.config.framework == 'onnxruntime' }} run: | + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \ -s -v -k "Onnx" \ --make-reports=tests_${{ matrix.config.report }} \ @@ -102,7 +107,8 @@ jobs: - name: Run example PyTorch CPU tests if: ${{ matrix.config.framework == 'pytorch_examples' }} run: | - python -m pip install peft + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" + python -m uv pip install peft python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \ --make-reports=tests_${{ matrix.config.report }} \ examples diff --git a/.github/workflows/push_tests_mps.yml b/.github/workflows/push_tests_mps.yml index 76c9a0f9663e..3a14f856346b 100644 --- a/.github/workflows/push_tests_mps.yml +++ b/.github/workflows/push_tests_mps.yml @@ -44,11 +44,11 @@ jobs: - name: Install dependencies shell: arch -arch arm64 bash {0} run: | - ${CONDA_RUN} python -m pip install --upgrade pip - ${CONDA_RUN} python -m pip install -e .[quality,test] - ${CONDA_RUN} python -m pip install torch torchvision torchaudio - ${CONDA_RUN} python -m pip install git+https://github.com/huggingface/accelerate.git - ${CONDA_RUN} python -m pip install transformers --upgrade + ${CONDA_RUN} python -m pip install --upgrade pip uv + ${CONDA_RUN} python -m uv pip install -e [quality,test] + ${CONDA_RUN} python -m uv pip install torch torchvision torchaudio + ${CONDA_RUN} python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git + ${CONDA_RUN} python -m uv pip install transformers --upgrade - name: Environment shell: arch -arch arm64 bash {0} From 56dac1cedcb55f4647874fa1b5e600ae67748f08 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 29 Feb 2024 19:01:54 +0530 Subject: [PATCH 07/28] limit documentation workflow runs for relevant changes. (#7125) --- .github/workflows/build_documentation.yml | 4 ++++ .github/workflows/build_pr_documentation.yml | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml index 05ea1b2122de..17d65b07e96e 100644 --- a/.github/workflows/build_documentation.yml +++ b/.github/workflows/build_documentation.yml @@ -7,6 +7,10 @@ on: - doc-builder* - v*-release - v*-patch + paths: + - "src/diffusers/**.py" + - "examples/" + - "docs/" jobs: build: diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index 33f09b309d49..267381fd64e7 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -2,6 +2,10 @@ name: Build PR Documentation on: pull_request: + paths: + - "src/diffusers/**.py" + - "examples/" + - "docs/" concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} From fcfa270fbd1dc294e2f3a505bae6bcb791d721c3 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 29 Feb 2024 19:28:52 +0530 Subject: [PATCH 08/28] add: support for notifying the maintainers about the docker ci status. 
(#7113) --- .github/workflows/build_docker_images.yml | 25 +++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/.github/workflows/build_docker_images.yml b/.github/workflows/build_docker_images.yml index 937ad07496b9..4179eb171cef 100644 --- a/.github/workflows/build_docker_images.yml +++ b/.github/workflows/build_docker_images.yml @@ -11,6 +11,7 @@ concurrency: env: REGISTRY: diffusers + CI_SLACK_CHANNEL: ${{ secrets.CI_DOCKER_CHANNEL }} jobs: build-docker-images: @@ -50,3 +51,27 @@ jobs: context: ./docker/${{ matrix.image-name }} push: true tags: ${{ env.REGISTRY }}/${{ matrix.image-name }}:latest + + - name: Post to a Slack channel + id: slack + uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001 + with: + # Slack channel id, channel name, or user id to post message. + # See also: https://api.slack.com/methods/chat.postMessage#channels + channel-id: ${{ env.CI_SLACK_CHANNEL }} + # For posting a rich message using Block Kit + payload: | + { + "text": "${{ matrix.image-name }} Docker Image build result: ${{ job.status }}\n${{ github.event.pull_request.html_url || github.event.head_commit.url }}", + "blocks": [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "${{ matrix.image-name }} Docker Image build result: ${{ job.status }}\n${{ github.event.pull_request.html_url || github.event.head_commit.url }}" + } + } + ] + } + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} From 8f2d13c684e282b258e5fbc73744eda4dc5f3cb6 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 29 Feb 2024 22:47:39 +0530 Subject: [PATCH 09/28] Fix setting fp16 dtype in AnimateDiff convert script. (#7127) * update * update --- scripts/convert_animatediff_motion_module_to_diffusers.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/convert_animatediff_motion_module_to_diffusers.py b/scripts/convert_animatediff_motion_module_to_diffusers.py index 9c5d236fd713..ceb967acd3d6 100644 --- a/scripts/convert_animatediff_motion_module_to_diffusers.py +++ b/scripts/convert_animatediff_motion_module_to_diffusers.py @@ -30,6 +30,7 @@ def get_args(): parser.add_argument("--output_path", type=str, required=True) parser.add_argument("--use_motion_mid_block", action="store_true") parser.add_argument("--motion_max_seq_length", type=int, default=32) + parser.add_argument("--save_fp16", action="store_true") return parser.parse_args() @@ -48,4 +49,6 @@ def get_args(): # skip loading position embeddings adapter.load_state_dict(conv_state_dict, strict=False) adapter.save_pretrained(args.output_path) - adapter.save_pretrained(args.output_path, variant="fp16", torch_dtype=torch.float16) + + if args.save_fp16: + adapter.to(torch.float16).save_pretrained(args.output_path, variant="fp16") From f4fc75035f2d567dfe00fc6b8671ecd38dd144b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Tolga=20Cang=C3=B6z?= <46008593+standardAI@users.noreply.github.com> Date: Fri, 1 Mar 2024 00:03:01 +0300 Subject: [PATCH 10/28] [`Docs`] Fix typos (#7131) * Add copyright notice to relevant files and fix typos * Set `timestep_spacing` parameter of `StableDiffusionXLPipeline`'s scheduler to `'trailing'`. * Update `StableDiffusionXLPipeline.from_single_file` by including EulerAncestralDiscreteScheduler with `timestep_spacing="trailing"` param. 
* Update model loading method in SDXL Turbo documentation

---
 .../en/api/models/consistency_decoder_vae.md  | 14 ++++++++++++-
 .../pipelines/stable_diffusion/sdxl_turbo.md  |  2 +-
 .../en/api/schedulers/consistency_decoder.md  | 16 +++++++++++++--
 docs/source/en/using-diffusers/sdxl_turbo.md  | 20 ++++++++++---------
 4 files changed, 39 insertions(+), 13 deletions(-)

diff --git a/docs/source/en/api/models/consistency_decoder_vae.md b/docs/source/en/api/models/consistency_decoder_vae.md
index b45f7fa059dc..94a64820ebb1 100644
--- a/docs/source/en/api/models/consistency_decoder_vae.md
+++ b/docs/source/en/api/models/consistency_decoder_vae.md
@@ -1,6 +1,18 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
 # Consistency Decoder
 
-Consistency decoder can be used to decode the latents from the denoising UNet in the [`StableDiffusionPipeline`]. This decoder was introduced in the [DALL-E 3 technical report](https://openai.com/dall-e-3).
+Consistency decoder can be used to decode the latents from the denoising UNet in the [`StableDiffusionPipeline`]. This decoder was introduced in the [DALL-E 3 technical report](https://openai.com/dall-e-3). The original codebase can be found at [openai/consistencydecoder](https://github.com/openai/consistencydecoder).
diff --git a/docs/source/en/api/pipelines/stable_diffusion/sdxl_turbo.md b/docs/source/en/api/pipelines/stable_diffusion/sdxl_turbo.md
index 935de29c9619..764685a73cfb 100644
--- a/docs/source/en/api/pipelines/stable_diffusion/sdxl_turbo.md
+++ b/docs/source/en/api/pipelines/stable_diffusion/sdxl_turbo.md
@@ -21,7 +21,7 @@ The abstract from the paper is:
 ## Tips
 
 - SDXL Turbo uses the exact same architecture as [SDXL](./stable_diffusion_xl), which means it also has the same API. Please refer to the [SDXL](./stable_diffusion_xl) API reference for more details.
-- SDXL Turbo should disable guidance scale by setting `guidance_scale=0.0`
+- SDXL Turbo should disable guidance scale by setting `guidance_scale=0.0`.
 - SDXL Turbo should use `timestep_spacing='trailing'` for the scheduler and use between 1 and 4 steps.
 - SDXL Turbo has been trained to generate images of size 512x512.
 - SDXL Turbo is open-access, but not open-source meaning that one might have to buy a model license in order to use it for commercial applications. Make sure to read the [official model card](https://huggingface.co/stabilityai/sdxl-turbo) to learn more.
diff --git a/docs/source/en/api/schedulers/consistency_decoder.md b/docs/source/en/api/schedulers/consistency_decoder.md
index 6c937b913279..a9eaa5336dcd 100644
--- a/docs/source/en/api/schedulers/consistency_decoder.md
+++ b/docs/source/en/api/schedulers/consistency_decoder.md
@@ -1,9 +1,21 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
 # ConsistencyDecoderScheduler
 
-This scheduler is a part of the [`ConsistencyDecoderPipeline`] and was introduced in [DALL-E 3](https://openai.com/dall-e-3).
+This scheduler is a part of the [`ConsistencyDecoderPipeline`] and was introduced in [DALL-E 3](https://openai.com/dall-e-3). The original codebase can be found at [openai/consistency_models](https://github.com/openai/consistency_models).
 ## ConsistencyDecoderScheduler
 
-[[autodoc]] schedulers.scheduling_consistency_decoder.ConsistencyDecoderScheduler
\ No newline at end of file
+[[autodoc]] schedulers.scheduling_consistency_decoder.ConsistencyDecoderScheduler
diff --git a/docs/source/en/using-diffusers/sdxl_turbo.md b/docs/source/en/using-diffusers/sdxl_turbo.md
index 4e960c320c97..9ec0e94ec1f6 100644
--- a/docs/source/en/using-diffusers/sdxl_turbo.md
+++ b/docs/source/en/using-diffusers/sdxl_turbo.md
@@ -31,29 +31,31 @@ Before you begin, make sure you have the following libraries installed:
 Model weights may be stored in separate subfolders on the Hub or locally, in which case, you should use the [`~StableDiffusionXLPipeline.from_pretrained`] method:
 
 ```py
-from diffusers import AutoPipelineForText2Image, AutoPipelineForImage2Image
+from diffusers import AutoPipelineForText2Image
 import torch
 
 pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/sdxl-turbo", torch_dtype=torch.float16, variant="fp16")
 pipeline = pipeline.to("cuda")
 ```
 
-You can also use the [`~StableDiffusionXLPipeline.from_single_file`] method to load a model checkpoint stored in a single file format (`.ckpt` or `.safetensors`) from the Hub or locally:
+You can also use the [`~StableDiffusionXLPipeline.from_single_file`] method to load a model checkpoint stored in a single file format (`.ckpt` or `.safetensors`) from the Hub or locally. For this loading method, you need to set `timestep_spacing="trailing"` (feel free to experiment with the other scheduler config values to get better results):
 
 ```py
-from diffusers import StableDiffusionXLPipeline
+from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
 import torch
 
 pipeline = StableDiffusionXLPipeline.from_single_file(
-    "https://huggingface.co/stabilityai/sdxl-turbo/blob/main/sd_xl_turbo_1.0_fp16.safetensors", torch_dtype=torch.float16)
+    "https://huggingface.co/stabilityai/sdxl-turbo/blob/main/sd_xl_turbo_1.0_fp16.safetensors",
+    torch_dtype=torch.float16, variant="fp16")
 pipeline = pipeline.to("cuda")
+pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(pipeline.scheduler.config, timestep_spacing="trailing")
 ```
 
 ## Text-to-image
 
 For text-to-image, pass a text prompt. By default, SDXL Turbo generates a 512x512 image, and that resolution gives the best results. You can try setting the `height` and `width` parameters to 768x768 or 1024x1024, but you should expect quality degradations when doing so.
 
-Make sure to set `guidance_scale` to 0.0 to disable, as the model was trained without it. A single inference step is enough to generate high quality images.
+Make sure to set `guidance_scale` to 0.0 to disable, as the model was trained without it. A single inference step is enough to generate high quality images. Increasing the number of steps to 2, 3 or 4 should improve image quality.
 
 ```py
@@ -75,7 +77,7 @@ image
 
 ## Image-to-image
 
-For image-to-image generation, make sure that `num_inference_steps * strength` is larger or equal to 1.
+For image-to-image generation, make sure that `num_inference_steps * strength` is larger than or equal to 1. The image-to-image pipeline will run for `int(num_inference_steps * strength)` steps, e.g. `0.5 * 2.0 = 1` step in our example below.
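To pin down the arithmetic in that sentence, a minimal sketch of the rule (the values mirror the example this doc uses; it restates the documented behavior, not the library's exact code path):

```py
num_inference_steps = 2
strength = 0.5

# The image-to-image pipeline denoises for int(num_inference_steps * strength)
# steps, so the product must be at least 1 for any denoising to happen at all.
effective_steps = int(num_inference_steps * strength)  # int(2 * 0.5) = 1
assert num_inference_steps * strength >= 1
```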
@@ -84,14 +86,14 @@ from diffusers import AutoPipelineForImage2Image from diffusers.utils import load_image, make_image_grid # use from_pipe to avoid consuming additional memory when loading a checkpoint -pipeline = AutoPipelineForImage2Image.from_pipe(pipeline_text2image).to("cuda") +pipeline_image2image = AutoPipelineForImage2Image.from_pipe(pipeline_text2image).to("cuda") init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png") init_image = init_image.resize((512, 512)) prompt = "cat wizard, gandalf, lord of the rings, detailed, fantasy, cute, adorable, Pixar, Disney, 8k" -image = pipeline(prompt, image=init_image, strength=0.5, guidance_scale=0.0, num_inference_steps=2).images[0] +image = pipeline_image2image(prompt, image=init_image, strength=0.5, guidance_scale=0.0, num_inference_steps=2).images[0] make_image_grid([init_image, image], rows=1, cols=2) ``` @@ -101,7 +103,7 @@ make_image_grid([init_image, image], rows=1, cols=2) ## Speed-up SDXL Turbo even more -- Compile the UNet if you are using PyTorch version 2 or better. The first inference run will be very slow, but subsequent ones will be much faster. +- Compile the UNet if you are using PyTorch version 2.0 or higher. The first inference run will be very slow, but subsequent ones will be much faster. ```py pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) From 06b01ea87e219c072e92db91dfda5f8f423b284a Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Thu, 29 Feb 2024 15:38:49 -1000 Subject: [PATCH 11/28] [ip-adapter] refactor `prepare_ip_adapter_image_embeds` and skip load `image_encoder` (#7016) * add Co-authored-by: Sayak Paul --------- Co-authored-by: yiyixuxu Co-authored-by: Sayak Paul --- docs/source/en/using-diffusers/ip_adapter.md | 33 +++++++++++ .../pipeline_animatediff_controlnet.py | 6 +- .../pipeline_animatediff_img2video.py | 6 +- src/diffusers/loaders/ip_adapter.py | 51 ++++++++++++----- .../animatediff/pipeline_animatediff.py | 38 +++++++++++-- .../pipeline_animatediff_video2video.py | 47 ++++++++++++++-- .../controlnet/pipeline_controlnet.py | 38 +++++++++++-- .../controlnet/pipeline_controlnet_img2img.py | 38 +++++++++++-- .../controlnet/pipeline_controlnet_inpaint.py | 38 +++++++++++-- .../pipeline_controlnet_inpaint_sd_xl.py | 38 +++++++++++-- .../controlnet/pipeline_controlnet_sd_xl.py | 38 +++++++++++-- .../pipeline_controlnet_sd_xl_img2img.py | 38 +++++++++++-- .../pipeline_latent_consistency_img2img.py | 38 +++++++++++-- .../pipeline_latent_consistency_text2img.py | 56 ++++++++++++++++--- src/diffusers/pipelines/pia/pipeline_pia.py | 38 +++++++++++-- .../pipeline_stable_diffusion.py | 38 +++++++++++-- .../pipeline_stable_diffusion_img2img.py | 38 +++++++++++-- .../pipeline_stable_diffusion_inpaint.py | 38 +++++++++++-- .../pipeline_stable_diffusion_ldm3d.py | 33 +++++++++-- .../pipeline_stable_diffusion_panorama.py | 38 +++++++++++-- .../pipeline_stable_diffusion_xl.py | 38 +++++++++++-- .../pipeline_stable_diffusion_xl_img2img.py | 38 +++++++++++-- .../pipeline_stable_diffusion_xl_inpaint.py | 38 +++++++++++-- .../pipeline_stable_diffusion_xl_adapter.py | 38 +++++++++++-- tests/pipelines/test_pipelines_common.py | 29 ++++++++++ 25 files changed, 769 insertions(+), 138 deletions(-) diff --git a/docs/source/en/using-diffusers/ip_adapter.md b/docs/source/en/using-diffusers/ip_adapter.md index fdb45c5b637c..38f7b12398df 100644 --- a/docs/source/en/using-diffusers/ip_adapter.md +++ 
b/docs/source/en/using-diffusers/ip_adapter.md
@@ -234,6 +234,39 @@ export_to_gif(frames, "gummy_bear.gif")
 > [!TIP]
 > While calling `load_ip_adapter()`, pass `low_cpu_mem_usage=True` to speed up the loading time.
 
+All the pipelines supporting IP-Adapter accept an `ip_adapter_image_embeds` argument. If you need to run the IP-Adapter multiple times with the same image, you can encode the image once and save the embedding to disk.
+
+```py
+image_embeds = pipeline.prepare_ip_adapter_image_embeds(
+    ip_adapter_image=image,
+    ip_adapter_image_embeds=None,
+    device="cuda",
+    num_images_per_prompt=1,
+    do_classifier_free_guidance=True,
+)
+
+torch.save(image_embeds, "image_embeds.ipadpt")
+```
+
+Load the image embedding and pass it to the pipeline as `ip_adapter_image_embeds`.
+
+> [!TIP]
+> ComfyUI image embeddings for IP-Adapters are fully compatible in Diffusers and should work out of the box.
+
+```py
+image_embeds = torch.load("image_embeds.ipadpt")
+images = pipeline(
+    prompt="a polar bear sitting in a chair drinking a milkshake",
+    ip_adapter_image_embeds=image_embeds,
+    negative_prompt="deformed, ugly, wrong proportion, low res, bad anatomy, worst quality, low quality",
+    num_inference_steps=100,
+    generator=generator,
+).images
+```
+
+> [!TIP]
+> If you use IP-Adapter with `ip_adapter_image_embeds` instead of `ip_adapter_image`, you can choose not to load an image encoder by passing `image_encoder_folder=None` to `load_ip_adapter()`.
+
 ## Specific use cases
 
 IP-Adapter's image prompting and compatibility with other adapters and models makes it a versatile tool for a variety of use cases. This section covers some of the more popular applications of IP-Adapter, and we can't wait to see what you come up with!
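The tip above pairs with the `load_ip_adapter()` change further down in this patch. A minimal end-to-end sketch of the intended flow (the SD 1.5 checkpoint and `h94/IP-Adapter` weights are illustrative choices; `image_embeds.ipadpt` is the file written by the `torch.save` call shown above):

```py
import torch
from diffusers import AutoPipelineForText2Image

pipeline = AutoPipelineForText2Image.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# image_encoder_folder=None skips loading the CLIP image encoder entirely,
# so the pipeline can only consume pre-computed embeddings afterwards.
pipeline.load_ip_adapter(
    "h94/IP-Adapter",
    subfolder="models",
    weight_name="ip-adapter_sd15.bin",
    image_encoder_folder=None,
)

image_embeds = torch.load("image_embeds.ipadpt")
image = pipeline(
    prompt="a polar bear sitting in a chair drinking a milkshake",
    ip_adapter_image_embeds=image_embeds,
    num_inference_steps=50,
).images[0]
```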
diff --git a/examples/community/pipeline_animatediff_controlnet.py b/examples/community/pipeline_animatediff_controlnet.py
index 44fc1bae6f7f..56dd856e1811 100644
--- a/examples/community/pipeline_animatediff_controlnet.py
+++ b/examples/community/pipeline_animatediff_controlnet.py
@@ -799,8 +799,10 @@ def __call__(
             ip_adapter_image (`PipelineImageInput`, *optional*):
                 Optional image input to work with IP Adapters.
             ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
-                Pre-generated image embeddings for IP-Adapter. If not
-                provided, embeddings are computed from the `ip_adapter_image` input argument.
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
+                Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
+                if `do_classifier_free_guidance` is set to `True`.
+                If not provided, embeddings are computed from the `ip_adapter_image` input argument.
             conditioning_frames (`List[PipelineImageInput]`, *optional*):
                 The ControlNet input condition to provide guidance to the `unet` for generation. If multiple ControlNets are specified, images must be passed as a list such that each element of the list can be correctly
diff --git a/examples/community/pipeline_animatediff_img2video.py b/examples/community/pipeline_animatediff_img2video.py
index 982bed08140b..fecb7211c7b9 100644
--- a/examples/community/pipeline_animatediff_img2video.py
+++ b/examples/community/pipeline_animatediff_img2video.py
@@ -798,8 +798,10 @@ def __call__(
             ip_adapter_image: (`PipelineImageInput`, *optional*):
                 Optional image input to work with IP Adapters.
             ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
-                Pre-generated image embeddings for IP-Adapter. If not
-                provided, embeddings are computed from the `ip_adapter_image` input argument.
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
+                Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
+                if `do_classifier_free_guidance` is set to `True`.
+                If not provided, embeddings are computed from the `ip_adapter_image` input argument.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or `np.array`.
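The shape contract these docstrings (and their copies in the pipelines below) spell out can be made concrete. A minimal sketch of a valid `ip_adapter_image_embeds` value for a single IP-Adapter (all sizes, including `emb_dim=1024`, are illustrative):

```py
import torch

batch_size, num_images, emb_dim = 1, 1, 1024  # illustrative sizes

positive = torch.randn(batch_size, num_images, emb_dim)
negative = torch.zeros_like(positive)  # stands in for the unconditional embedding

# With classifier-free guidance, the negative embedding is stacked first, so
# each list entry has shape (2 * batch_size, num_images, emb_dim); the
# pipelines recover the two halves again with .chunk(2).
ip_adapter_image_embeds = [torch.cat([negative, positive], dim=0)]
```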
diff --git a/src/diffusers/loaders/ip_adapter.py b/src/diffusers/loaders/ip_adapter.py
index d6bf50e51eee..2b70ed84d7ed 100644
--- a/src/diffusers/loaders/ip_adapter.py
+++ b/src/diffusers/loaders/ip_adapter.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 from pathlib import Path
-from typing import Dict, List, Union
+from typing import Dict, List, Optional, Union
 
 import torch
 from huggingface_hub.utils import validate_hf_hub_args
@@ -52,11 +52,12 @@ def load_ip_adapter(
         pretrained_model_name_or_path_or_dict: Union[str, List[str], Dict[str, torch.Tensor]],
         subfolder: Union[str, List[str]],
         weight_name: Union[str, List[str]],
+        image_encoder_folder: Optional[str] = "image_encoder",
         **kwargs,
     ):
         """
         Parameters:
-            pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`):
+            pretrained_model_name_or_path_or_dict (`str` or `List[str]` or `os.PathLike` or `List[os.PathLike]` or `dict` or `List[dict]`):
                 Can be either:
 
                 - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on
@@ -65,7 +66,18 @@ def load_ip_adapter(
                       with [`ModelMixin.save_pretrained`].
                 - A [torch state dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict).
-
+            subfolder (`str` or `List[str]`):
+                The subfolder location of a model file within a larger model repository on the Hub or locally.
+                If a list is passed, it should have the same length as `weight_name`.
+            weight_name (`str` or `List[str]`):
+                The name of the weight file to load. If a list is passed, it should have the same length as
+                `subfolder`.
+            image_encoder_folder (`str`, *optional*, defaults to `image_encoder`):
+                The subfolder location of the image encoder within a larger model repository on the Hub or locally.
+                Pass `None` to not load the image encoder. If the image encoder is located in a folder inside `subfolder`,
+                you only need to pass the name of the folder that contains image encoder weights, e.g. `image_encoder_folder="image_encoder"`.
+                If the image encoder is located in a folder other than `subfolder`, you should pass the path to the folder that contains image encoder weights,
+                for example, `image_encoder_folder="different_subfolder/image_encoder"`.
             cache_dir (`Union[str, os.PathLike]`, *optional*):
                 Path to a directory where a downloaded pretrained model configuration is cached if the standard cache is not used.
@@ -87,8 +99,6 @@
             revision (`str`, *optional*, defaults to `"main"`):
                 The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
                 allowed by Git.
-            subfolder (`str`, *optional*, defaults to `""`):
-                The subfolder location of a model file within a larger model repository on the Hub or locally.
             low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`):
                 Speed up model loading only loading the pretrained weights and not initializing the weights. This also
                 tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model.
@@ -184,16 +194,29 @@ def load_ip_adapter(
         # load CLIP image encoder here if it has not been registered to the pipeline yet
         if hasattr(self, "image_encoder") and getattr(self, "image_encoder", None) is None:
-            if not isinstance(pretrained_model_name_or_path_or_dict, dict):
-                logger.info(f"loading image_encoder from {pretrained_model_name_or_path_or_dict}")
-                image_encoder = CLIPVisionModelWithProjection.from_pretrained(
-                    pretrained_model_name_or_path_or_dict,
-                    subfolder=Path(subfolder, "image_encoder").as_posix(),
-                    low_cpu_mem_usage=low_cpu_mem_usage,
-                ).to(self.device, dtype=self.dtype)
-                self.register_modules(image_encoder=image_encoder)
+            if image_encoder_folder is not None:
+                if not isinstance(pretrained_model_name_or_path_or_dict, dict):
+                    logger.info(f"loading image_encoder from {pretrained_model_name_or_path_or_dict}")
+                    if image_encoder_folder.count("/") == 0:
+                        image_encoder_subfolder = Path(subfolder, image_encoder_folder).as_posix()
+                    else:
+                        image_encoder_subfolder = Path(image_encoder_folder).as_posix()
+
+                    image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+                        pretrained_model_name_or_path_or_dict,
+                        subfolder=image_encoder_subfolder,
+                        low_cpu_mem_usage=low_cpu_mem_usage,
+                    ).to(self.device, dtype=self.dtype)
+                    self.register_modules(image_encoder=image_encoder)
+                else:
+                    raise ValueError(
+                        "`image_encoder` cannot be loaded because `pretrained_model_name_or_path_or_dict` is a state dict."
+                    )
             else:
-                raise ValueError("`image_encoder` cannot be None when using IP Adapters.")
+                logger.warning(
+                    "image_encoder is not loaded since `image_encoder_folder=None` passed. You will not be able to use `ip_adapter_image` when calling the pipeline with IP-Adapter."
+                    "Use `ip_adapter_image_embeds` to pass pre-generated image embeddings instead."
+ ) # create feature extractor if it has not been registered to the pipeline yet if hasattr(self, "feature_extractor") and getattr(self, "feature_extractor", None) is None: diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py index db52e75fce23..e98d4ad4e37b 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py @@ -370,7 +370,7 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds def prepare_ip_adapter_image_embeds( - self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance ): if ip_adapter_image_embeds is None: if not isinstance(ip_adapter_image, list): @@ -394,13 +394,23 @@ def prepare_ip_adapter_image_embeds( [single_negative_image_embeds] * num_images_per_prompt, dim=0 ) - if self.do_classifier_free_guidance: + if do_classifier_free_guidance: single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) single_image_embeds = single_image_embeds.to(device) image_embeds.append(single_image_embeds) else: - image_embeds = ip_adapter_image_embeds + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + image_embeds.append(single_image_embeds) + return image_embeds # Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents @@ -494,6 +504,16 @@ def check_inputs( "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." ) + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim != 3: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.prepare_latents def prepare_latents( self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None @@ -612,8 +632,10 @@ def __call__( ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. If not - provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. + Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. 
It should contain the negative image embedding + if `do_classifier_free_guidance` is set to `True`. + If not provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or `np.array`. @@ -717,7 +739,11 @@ def __call__( if ip_adapter_image is not None or ip_adapter_image_embeds is not None: image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_videos_per_prompt + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_videos_per_prompt, + self.do_classifier_free_guidance, ) # 4. Prepare timesteps diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py index 2963d181b274..ced64889044f 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py @@ -448,7 +448,7 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds def prepare_ip_adapter_image_embeds( - self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance ): if ip_adapter_image_embeds is None: if not isinstance(ip_adapter_image, list): @@ -472,13 +472,23 @@ def prepare_ip_adapter_image_embeds( [single_negative_image_embeds] * num_images_per_prompt, dim=0 ) - if self.do_classifier_free_guidance: + if do_classifier_free_guidance: single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) single_image_embeds = single_image_embeds.to(device) image_embeds.append(single_image_embeds) else: - image_embeds = ip_adapter_image_embeds + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + image_embeds.append(single_image_embeds) + return image_embeds # Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents @@ -523,6 +533,8 @@ def check_inputs( negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, callback_on_step_end_tensor_inputs=None, ): if strength < 0 or strength > 1: @@ -567,6 +579,21 @@ def check_inputs( if video is not None and latents is not None: raise ValueError("Only one of `video` or `latents` should be provided") + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." 
+ ) + + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim != 3: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + def get_timesteps(self, num_inference_steps, timesteps, strength, device): # get the original timestep using init_timestep init_timestep = min(int(num_inference_steps * strength), num_inference_steps) @@ -765,8 +792,10 @@ def __call__( ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. If not - provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. + Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding + if `do_classifier_free_guidance` is set to `True`. + If not provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or `np.array`. @@ -814,6 +843,8 @@ def __call__( negative_prompt_embeds=negative_prompt_embeds, video=video, latents=latents, + ip_adapter_image=ip_adapter_image, + ip_adapter_image_embeds=ip_adapter_image_embeds, callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs, ) @@ -855,7 +886,11 @@ def __call__( if ip_adapter_image is not None or ip_adapter_image_embeds is not None: image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_videos_per_prompt + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_videos_per_prompt, + self.do_classifier_free_guidance, ) # 4. 
Prepare timesteps diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py index 0f5fcfeca3c1..eab2f7aa22d0 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py @@ -480,7 +480,7 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds def prepare_ip_adapter_image_embeds( - self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance ): if ip_adapter_image_embeds is None: if not isinstance(ip_adapter_image, list): @@ -504,13 +504,23 @@ def prepare_ip_adapter_image_embeds( [single_negative_image_embeds] * num_images_per_prompt, dim=0 ) - if self.do_classifier_free_guidance: + if do_classifier_free_guidance: single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) single_image_embeds = single_image_embeds.to(device) image_embeds.append(single_image_embeds) else: - image_embeds = ip_adapter_image_embeds + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + image_embeds.append(single_image_embeds) + return image_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker @@ -711,6 +721,16 @@ def check_inputs( "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." ) + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim != 3: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + def check_image(self, image, prompt, prompt_embeds): image_is_pil = isinstance(image, PIL.Image.Image) image_is_tensor = isinstance(image, torch.Tensor) @@ -933,8 +953,10 @@ def __call__( not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. If not - provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. + Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding + if `do_classifier_free_guidance` is set to `True`. + If not provided, embeddings are computed from the `ip_adapter_image` input argument. 
output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): @@ -1076,7 +1098,11 @@ def __call__( if ip_adapter_image is not None or ip_adapter_image_embeds is not None: image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_images_per_prompt + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, ) # 4. Prepare image diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py index fe672db5bf8e..4fc9791d3d8e 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py @@ -473,7 +473,7 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds def prepare_ip_adapter_image_embeds( - self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance ): if ip_adapter_image_embeds is None: if not isinstance(ip_adapter_image, list): @@ -497,13 +497,23 @@ def prepare_ip_adapter_image_embeds( [single_negative_image_embeds] * num_images_per_prompt, dim=0 ) - if self.do_classifier_free_guidance: + if do_classifier_free_guidance: single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) single_image_embeds = single_image_embeds.to(device) image_embeds.append(single_image_embeds) else: - image_embeds = ip_adapter_image_embeds + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + image_embeds.append(single_image_embeds) + return image_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker @@ -698,6 +708,16 @@ def check_inputs( "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." ) + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim != 3: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image def check_image(self, image, prompt, prompt_embeds): image_is_pil = isinstance(image, PIL.Image.Image) @@ -951,8 +971,10 @@ def __call__( not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. 
ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. If not - provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. + Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding + if `do_classifier_free_guidance` is set to `True`. + If not provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): @@ -1088,7 +1110,11 @@ def __call__( if ip_adapter_image is not None or ip_adapter_image_embeds is not None: image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_images_per_prompt + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, ) # 4. Prepare image diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index 80a430b15592..ce7537d84215 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -598,7 +598,7 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds def prepare_ip_adapter_image_embeds( - self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance ): if ip_adapter_image_embeds is None: if not isinstance(ip_adapter_image, list): @@ -622,13 +622,23 @@ def prepare_ip_adapter_image_embeds( [single_negative_image_embeds] * num_images_per_prompt, dim=0 ) - if self.do_classifier_free_guidance: + if do_classifier_free_guidance: single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) single_image_embeds = single_image_embeds.to(device) image_embeds.append(single_image_embeds) else: - image_embeds = ip_adapter_image_embeds + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + image_embeds.append(single_image_embeds) + return image_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker @@ -856,6 +866,16 @@ def check_inputs( "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." 
) + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim != 3: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image def check_image(self, image, prompt, prompt_embeds): image_is_pil = isinstance(image, PIL.Image.Image) @@ -1180,8 +1200,10 @@ def __call__( not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. If not - provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. + Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding + if `do_classifier_free_guidance` is set to `True`. + If not provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): @@ -1330,7 +1352,11 @@ def __call__( if ip_adapter_image is not None or ip_adapter_image_embeds is not None: image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_images_per_prompt + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, ) # 4. 
Prepare image diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py index e42977a7b55a..0f1d5ea48e71 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py @@ -507,7 +507,7 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds def prepare_ip_adapter_image_embeds( - self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance ): if ip_adapter_image_embeds is None: if not isinstance(ip_adapter_image, list): @@ -531,13 +531,23 @@ def prepare_ip_adapter_image_embeds( [single_negative_image_embeds] * num_images_per_prompt, dim=0 ) - if self.do_classifier_free_guidance: + if do_classifier_free_guidance: single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) single_image_embeds = single_image_embeds.to(device) image_embeds.append(single_image_embeds) else: - image_embeds = ip_adapter_image_embeds + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + image_embeds.append(single_image_embeds) + return image_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -802,6 +812,16 @@ def check_inputs( "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." ) + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim != 3: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + def prepare_control_image( self, image, @@ -1220,8 +1240,10 @@ def __call__( argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. If not - provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. + Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding + if `do_classifier_free_guidance` is set to `True`. + If not provided, embeddings are computed from the `ip_adapter_image` input argument. pooled_prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated pooled text embeddings. 
Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. @@ -1411,7 +1433,11 @@ def __call__( # 3.1 Encode ip_adapter_image if ip_adapter_image is not None or ip_adapter_image_embeds is not None: image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_images_per_prompt + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, ) # 4. set timesteps diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py index 4e0a880a4a11..9883b4f64790 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py @@ -485,7 +485,7 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds def prepare_ip_adapter_image_embeds( - self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance ): if ip_adapter_image_embeds is None: if not isinstance(ip_adapter_image, list): @@ -509,13 +509,23 @@ def prepare_ip_adapter_image_embeds( [single_negative_image_embeds] * num_images_per_prompt, dim=0 ) - if self.do_classifier_free_guidance: + if do_classifier_free_guidance: single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) single_image_embeds = single_image_embeds.to(device) image_embeds.append(single_image_embeds) else: - image_embeds = ip_adapter_image_embeds + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + image_embeds.append(single_image_embeds) + return image_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -715,6 +725,16 @@ def check_inputs( "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." ) + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim != 3: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image def check_image(self, image, prompt, prompt_embeds): image_is_pil = isinstance(image, PIL.Image.Image) @@ -998,8 +1018,10 @@ def __call__( argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. 
ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. If not - provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. + Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding + if `do_classifier_free_guidance` is set to `True`. + If not provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): @@ -1171,7 +1193,11 @@ def __call__( # 3.2 Encode ip_adapter_image if ip_adapter_image is not None or ip_adapter_image_embeds is not None: image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_images_per_prompt + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, ) # 4. Prepare image diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py index 41a8c4fa005e..cf32ae81c562 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py @@ -537,7 +537,7 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds def prepare_ip_adapter_image_embeds( - self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance ): if ip_adapter_image_embeds is None: if not isinstance(ip_adapter_image, list): @@ -561,13 +561,23 @@ def prepare_ip_adapter_image_embeds( [single_negative_image_embeds] * num_images_per_prompt, dim=0 ) - if self.do_classifier_free_guidance: + if do_classifier_free_guidance: single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) single_image_embeds = single_image_embeds.to(device) image_embeds.append(single_image_embeds) else: - image_embeds = ip_adapter_image_embeds + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + image_embeds.append(single_image_embeds) + return image_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -779,6 +789,16 @@ def check_inputs( "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." 
) + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim != 3: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + # Copied from diffusers.pipelines.controlnet.pipeline_controlnet_sd_xl.StableDiffusionXLControlNetPipeline.check_image def check_image(self, image, prompt, prompt_embeds): image_is_pil = isinstance(image, PIL.Image.Image) @@ -1149,8 +1169,10 @@ def __call__( input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. If not - provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. + Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding + if `do_classifier_free_guidance` is set to `True`. + If not provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. @@ -1334,7 +1356,11 @@ def __call__( # 3.2 Encode ip_adapter_image if ip_adapter_image is not None or ip_adapter_image_embeds is not None: image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_images_per_prompt + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, ) # 4. 
Prepare image and controlnet_conditioning_image diff --git a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py index e6adae97ea6c..6d1b1a0db444 100644 --- a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +++ b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py @@ -423,7 +423,7 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds def prepare_ip_adapter_image_embeds( - self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance ): if ip_adapter_image_embeds is None: if not isinstance(ip_adapter_image, list): @@ -447,13 +447,23 @@ def prepare_ip_adapter_image_embeds( [single_negative_image_embeds] * num_images_per_prompt, dim=0 ) - if self.do_classifier_free_guidance: + if do_classifier_free_guidance: single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) single_image_embeds = single_image_embeds.to(device) image_embeds.append(single_image_embeds) else: - image_embeds = ip_adapter_image_embeds + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + image_embeds.append(single_image_embeds) + return image_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker @@ -632,6 +642,16 @@ def check_inputs( "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." ) + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim != 3: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + @property def guidance_scale(self): return self._guidance_scale @@ -720,8 +740,10 @@ def __call__( ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. If not - provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. + Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding + if `do_classifier_free_guidance` is set to `True`. + If not provided, embeddings are computed from the `ip_adapter_image` input argument. 
output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): @@ -794,7 +816,11 @@ def __call__( if ip_adapter_image is not None or ip_adapter_image_embeds is not None: image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_images_per_prompt + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, ) # 3. Encode input prompt diff --git a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py index 0112fb916369..fa27c0fbd5bc 100644 --- a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +++ b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py @@ -407,7 +407,7 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds def prepare_ip_adapter_image_embeds( - self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance ): if ip_adapter_image_embeds is None: if not isinstance(ip_adapter_image, list): @@ -431,13 +431,23 @@ def prepare_ip_adapter_image_embeds( [single_negative_image_embeds] * num_images_per_prompt, dim=0 ) - if self.do_classifier_free_guidance: + if do_classifier_free_guidance: single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) single_image_embeds = single_image_embeds.to(device) image_embeds.append(single_image_embeds) else: - image_embeds = ip_adapter_image_embeds + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + image_embeds.append(single_image_embeds) + return image_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker @@ -527,6 +537,8 @@ def check_inputs( width: int, callback_steps: int, prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, callback_on_step_end_tensor_inputs=None, ): if height % 8 != 0 or width % 8 != 0: @@ -557,6 +569,21 @@ def check_inputs( elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." 
+ ) + + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim != 3: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + @property def guidance_scale(self): return self._guidance_scale @@ -645,8 +672,10 @@ def __call__( ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. If not - provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. + Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding + if `do_classifier_free_guidance` is set to `True`. + If not provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): @@ -699,7 +728,16 @@ def __call__( width = width or self.unet.config.sample_size * self.vae_scale_factor # 1. Check inputs. Raise error if not correct - self.check_inputs(prompt, height, width, callback_steps, prompt_embeds, callback_on_step_end_tensor_inputs) + self.check_inputs( + prompt, + height, + width, + callback_steps, + prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, + callback_on_step_end_tensor_inputs, + ) self._guidance_scale = guidance_scale self._clip_skip = clip_skip self._cross_attention_kwargs = cross_attention_kwargs @@ -716,7 +754,11 @@ def __call__( if ip_adapter_image is not None or ip_adapter_image_embeds is not None: image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_images_per_prompt + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, ) # 3. Encode input prompt diff --git a/src/diffusers/pipelines/pia/pipeline_pia.py b/src/diffusers/pipelines/pia/pipeline_pia.py index 9feebf29d797..485ccb22e5e9 100644 --- a/src/diffusers/pipelines/pia/pipeline_pia.py +++ b/src/diffusers/pipelines/pia/pipeline_pia.py @@ -577,9 +577,19 @@ def check_inputs( "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." 
) + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim != 3: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds def prepare_ip_adapter_image_embeds( - self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance ): if ip_adapter_image_embeds is None: if not isinstance(ip_adapter_image, list): @@ -603,13 +613,23 @@ def prepare_ip_adapter_image_embeds( [single_negative_image_embeds] * num_images_per_prompt, dim=0 ) - if self.do_classifier_free_guidance: + if do_classifier_free_guidance: single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) single_image_embeds = single_image_embeds.to(device) image_embeds.append(single_image_embeds) else: - image_embeds = ip_adapter_image_embeds + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + image_embeds.append(single_image_embeds) + return image_embeds # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.prepare_latents @@ -798,8 +818,10 @@ def __call__( ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. If not - provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. + Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding + if `do_classifier_free_guidance` is set to `True`. + If not provided, embeddings are computed from the `ip_adapter_image` input argument. motion_scale: (`int`, *optional*, defaults to 0): Parameter that controls the amount and type of motion that is added to the image. Increasing the value increases the amount of motion, while specific ranges of values control the type of motion that is added. Must be between 0 and 8. @@ -891,7 +913,11 @@ def __call__( if ip_adapter_image is not None or ip_adapter_image_embeds is not None: image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_videos_per_prompt + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_videos_per_prompt, + self.do_classifier_free_guidance, ) # 4. 
Prepare timesteps diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 762565ea1fd3..5126e6f4c378 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -490,7 +490,7 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state return image_embeds, uncond_image_embeds def prepare_ip_adapter_image_embeds( - self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance ): if ip_adapter_image_embeds is None: if not isinstance(ip_adapter_image, list): @@ -514,13 +514,23 @@ def prepare_ip_adapter_image_embeds( [single_negative_image_embeds] * num_images_per_prompt, dim=0 ) - if self.do_classifier_free_guidance: + if do_classifier_free_guidance: single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) single_image_embeds = single_image_embeds.to(device) image_embeds.append(single_image_embeds) else: - image_embeds = ip_adapter_image_embeds + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + image_embeds.append(single_image_embeds) + return image_embeds def run_safety_checker(self, image, device, dtype): @@ -624,6 +634,16 @@ def check_inputs( "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." ) + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim != 3: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) if isinstance(generator, list) and len(generator) != batch_size: @@ -772,8 +792,10 @@ def __call__( not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. If not - provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. + Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding + if `do_classifier_free_guidance` is set to `True`. 
+ If not provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): @@ -885,7 +907,11 @@ def __call__( if ip_adapter_image is not None or ip_adapter_image_embeds is not None: image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_images_per_prompt + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, ) # 4. Prepare timesteps diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 23b6e0f20520..9c6fbb2310ac 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -534,7 +534,7 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds def prepare_ip_adapter_image_embeds( - self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance ): if ip_adapter_image_embeds is None: if not isinstance(ip_adapter_image, list): @@ -558,13 +558,23 @@ def prepare_ip_adapter_image_embeds( [single_negative_image_embeds] * num_images_per_prompt, dim=0 ) - if self.do_classifier_free_guidance: + if do_classifier_free_guidance: single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) single_image_embeds = single_image_embeds.to(device) image_embeds.append(single_image_embeds) else: - image_embeds = ip_adapter_image_embeds + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + image_embeds.append(single_image_embeds) + return image_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker @@ -670,6 +680,16 @@ def check_inputs( "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." 
) + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim != 3: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + def get_timesteps(self, num_inference_steps, strength, device): # get the original timestep using init_timestep init_timestep = min(int(num_inference_steps * strength), num_inference_steps) @@ -868,8 +888,10 @@ def __call__( not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. If not - provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. + Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding + if `do_classifier_free_guidance` is set to `True`. + If not provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): @@ -967,7 +989,11 @@ def __call__( if ip_adapter_image is not None or ip_adapter_image_embeds is not None: image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_images_per_prompt + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, ) # 4. 
Preprocess image diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index 4664da1550f2..3392fd6ddecc 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -606,7 +606,7 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds def prepare_ip_adapter_image_embeds( - self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance ): if ip_adapter_image_embeds is None: if not isinstance(ip_adapter_image, list): @@ -630,13 +630,23 @@ def prepare_ip_adapter_image_embeds( [single_negative_image_embeds] * num_images_per_prompt, dim=0 ) - if self.do_classifier_free_guidance: + if do_classifier_free_guidance: single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) single_image_embeds = single_image_embeds.to(device) image_embeds.append(single_image_embeds) else: - image_embeds = ip_adapter_image_embeds + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + image_embeds.append(single_image_embeds) + return image_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker @@ -752,6 +762,16 @@ def check_inputs( "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." ) + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim != 3: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + def prepare_latents( self, batch_size, @@ -1037,8 +1057,10 @@ def __call__( not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. If not - provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. + Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding + if `do_classifier_free_guidance` is set to `True`. + If not provided, embeddings are computed from the `ip_adapter_image` input argument. 
output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): @@ -1175,7 +1197,11 @@ def __call__( if ip_adapter_image is not None or ip_adapter_image_embeds is not None: image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_images_per_prompt + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, ) # 4. set timesteps diff --git a/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py b/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py index ddbf9ebbb1d0..502cd340bcd8 100644 --- a/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +++ b/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py @@ -410,8 +410,9 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state return image_embeds, uncond_image_embeds + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds def prepare_ip_adapter_image_embeds( - self, ip_adapter_image, ip_adapter_image_embeds, do_classifier_free_guidance, device, num_images_per_prompt + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance ): if ip_adapter_image_embeds is None: if not isinstance(ip_adapter_image, list): @@ -441,7 +442,17 @@ def prepare_ip_adapter_image_embeds( image_embeds.append(single_image_embeds) else: - image_embeds = ip_adapter_image_embeds + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + image_embeds.append(single_image_embeds) + return image_embeds def run_safety_checker(self, image, device, dtype): @@ -537,6 +548,16 @@ def check_inputs( "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." ) + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim != 3: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) if isinstance(generator, list) and len(generator) != batch_size: @@ -619,8 +640,10 @@ def __call__( ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. 
If not - provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. + Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding + if `do_classifier_free_guidance` is set to `True`. + If not provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): @@ -682,9 +705,9 @@ def __call__( image_embeds = self.prepare_ip_adapter_image_embeds( ip_adapter_image, ip_adapter_image_embeds, - do_classifier_free_guidance, device, batch_size * num_images_per_prompt, + do_classifier_free_guidance, ) # 3. Encode input prompt diff --git a/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py b/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py index 1b371b4746ad..49cc68926b7e 100644 --- a/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +++ b/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py @@ -384,7 +384,7 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds def prepare_ip_adapter_image_embeds( - self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance ): if ip_adapter_image_embeds is None: if not isinstance(ip_adapter_image, list): @@ -408,13 +408,23 @@ def prepare_ip_adapter_image_embeds( [single_negative_image_embeds] * num_images_per_prompt, dim=0 ) - if self.do_classifier_free_guidance: + if do_classifier_free_guidance: single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) single_image_embeds = single_image_embeds.to(device) image_embeds.append(single_image_embeds) else: - image_embeds = ip_adapter_image_embeds + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + image_embeds.append(single_image_embeds) + return image_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker @@ -535,6 +545,16 @@ def check_inputs( "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." 
) + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim != 3: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) @@ -644,8 +664,10 @@ def __call__( ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. If not - provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. + Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding + if `do_classifier_free_guidance` is set to `True`. + If not provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): @@ -710,7 +732,11 @@ def __call__( if ip_adapter_image is not None or ip_adapter_image_embeds is not None: image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_images_per_prompt + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, ) # 3. 
Encode input prompt diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index 98028f8be50e..4a34ae89d245 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -519,7 +519,7 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds def prepare_ip_adapter_image_embeds( - self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance ): if ip_adapter_image_embeds is None: if not isinstance(ip_adapter_image, list): @@ -543,13 +543,23 @@ def prepare_ip_adapter_image_embeds( [single_negative_image_embeds] * num_images_per_prompt, dim=0 ) - if self.do_classifier_free_guidance: + if do_classifier_free_guidance: single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) single_image_embeds = single_image_embeds.to(device) image_embeds.append(single_image_embeds) else: - image_embeds = ip_adapter_image_embeds + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + image_embeds.append(single_image_embeds) + return image_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -656,6 +666,16 @@ def check_inputs( "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." ) + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim != 3: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) @@ -890,8 +910,10 @@ def __call__( input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. If not - provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. + Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. 
It should contain the negative image embedding + if `do_classifier_free_guidance` is set to `True`. + If not provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. @@ -1093,7 +1115,11 @@ def __call__( if ip_adapter_image is not None or ip_adapter_image_embeds is not None: image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_images_per_prompt + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, ) # 8. Denoising loop diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index 58c6f3ff6ebf..ef25ca94d16c 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -611,6 +611,16 @@ def check_inputs( "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." ) + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim != 3: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + def get_timesteps(self, num_inference_steps, strength, device, denoising_start=None): # get the original timestep using init_timestep if denoising_start is None: @@ -742,7 +752,7 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds def prepare_ip_adapter_image_embeds( - self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance ): if ip_adapter_image_embeds is None: if not isinstance(ip_adapter_image, list): @@ -766,13 +776,23 @@ def prepare_ip_adapter_image_embeds( [single_negative_image_embeds] * num_images_per_prompt, dim=0 ) - if self.do_classifier_free_guidance: + if do_classifier_free_guidance: single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) single_image_embeds = single_image_embeds.to(device) image_embeds.append(single_image_embeds) else: - image_embeds = ip_adapter_image_embeds + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + image_embeds.append(single_image_embeds) + return image_embeds def _get_add_time_ids( @@ -1038,8 +1058,10 @@ def __call__( 
input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. If not - provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. + Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding + if `do_classifier_free_guidance` is set to `True`. + If not provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. @@ -1264,7 +1286,11 @@ def denoising_value_valid(dnv): if ip_adapter_image is not None or ip_adapter_image_embeds is not None: image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_images_per_prompt + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, ) # 9. Denoising loop diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py index 7c966c73acf8..2cb946eb56ad 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py @@ -456,7 +456,7 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds def prepare_ip_adapter_image_embeds( - self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance ): if ip_adapter_image_embeds is None: if not isinstance(ip_adapter_image, list): @@ -480,13 +480,23 @@ def prepare_ip_adapter_image_embeds( [single_negative_image_embeds] * num_images_per_prompt, dim=0 ) - if self.do_classifier_free_guidance: + if do_classifier_free_guidance: single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) single_image_embeds = single_image_embeds.to(device) image_embeds.append(single_image_embeds) else: - image_embeds = ip_adapter_image_embeds + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + image_embeds.append(single_image_embeds) + return image_embeds # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt @@ -836,6 +846,16 @@ def check_inputs( "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. 
Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." ) + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim != 3: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + def prepare_latents( self, batch_size, @@ -1290,8 +1310,10 @@ def __call__( input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. If not - provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. + Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding + if `do_classifier_free_guidance` is set to `True`. + If not provided, embeddings are computed from the `ip_adapter_image` input argument. num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. eta (`float`, *optional*, defaults to 0.0): @@ -1612,7 +1634,11 @@ def denoising_value_valid(dnv): if ip_adapter_image is not None or ip_adapter_image_embeds is not None: image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_images_per_prompt + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, ) # 11. 
Denoising loop diff --git a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py index 96c7c6857c05..abf743f4f305 100644 --- a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py @@ -533,7 +533,7 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds def prepare_ip_adapter_image_embeds( - self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance ): if ip_adapter_image_embeds is None: if not isinstance(ip_adapter_image, list): @@ -557,13 +557,23 @@ def prepare_ip_adapter_image_embeds( [single_negative_image_embeds] * num_images_per_prompt, dim=0 ) - if self.do_classifier_free_guidance: + if do_classifier_free_guidance: single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) single_image_embeds = single_image_embeds.to(device) image_embeds.append(single_image_embeds) else: - image_embeds = ip_adapter_image_embeds + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + image_embeds.append(single_image_embeds) + return image_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -671,6 +681,16 @@ def check_inputs( "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." ) + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim != 3: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) @@ -914,8 +934,10 @@ def __call__( input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. If not - provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. + Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. 
It should contain the negative image embedding + if `do_classifier_free_guidance` is set to `True`. + If not provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. @@ -1057,7 +1079,11 @@ def __call__( # 3.2 Encode ip_adapter_image if ip_adapter_image is not None or ip_adapter_image_embeds is not None: image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_images_per_prompt + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, ) # 4. Prepare timesteps diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index 0ca464d3bd13..1475e2ff07d2 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -319,6 +319,35 @@ def test_ip_adapter_multi(self, expected_max_diff: float = 1e-4): "Output with multi-ip-adapter scale must be different from normal inference", ) + def test_ip_adapter_cfg(self, expected_max_diff: float = 1e-4): + parameters = inspect.signature(self.pipeline_class.__call__).parameters + + if "guidance_scale" not in parameters: + return + + components = self.get_dummy_components() + pipe = self.pipeline_class(**components).to(torch_device) + pipe.set_progress_bar_config(disable=None) + cross_attention_dim = pipe.unet.config.get("cross_attention_dim", 32) + + adapter_state_dict = create_ip_adapter_state_dict(pipe.unet) + pipe.unet._load_ip_adapter_weights(adapter_state_dict) + pipe.set_ip_adapter_scale(1.0) + + # forward pass with CFG not applied + inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device)) + inputs["ip_adapter_image_embeds"] = [self._get_dummy_image_embeds(cross_attention_dim)[0].unsqueeze(0)] + inputs["guidance_scale"] = 1.0 + out_no_cfg = pipe(**inputs)[0] + + # forward pass with CFG applied + inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device)) + inputs["ip_adapter_image_embeds"] = [self._get_dummy_image_embeds(cross_attention_dim)] + inputs["guidance_scale"] = 7.5 + out_cfg = pipe(**inputs)[0] + + assert out_cfg.shape == out_no_cfg.shape + class PipelineLatentTesterMixin: """ From c2b6ac4e34bae197c179def350e53ae6e7523a48 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Fri, 1 Mar 2024 07:18:49 +0530 Subject: [PATCH 12/28] [CI] fix path filtering in the documentation workflows (#7153) fix: path --- .github/workflows/build_documentation.yml | 4 ++-- .github/workflows/build_pr_documentation.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml index 17d65b07e96e..d9054928ed8d 100644 --- a/.github/workflows/build_documentation.yml +++ b/.github/workflows/build_documentation.yml @@ -9,8 +9,8 @@ on: - v*-patch paths: - "src/diffusers/**.py" - - "examples/" - - "docs/" + - "examples/**" + - "docs/**" jobs: build: diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index 267381fd64e7..8e19d8fafbe3 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -4,8 +4,8 @@ on: pull_request: paths: - "src/diffusers/**.py" - - "examples/" - - "docs/" + - "examples/**" + - 
"docs/**" concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} From 9451235e5a6f094eff429360b72d0f8aff86128a Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Fri, 1 Mar 2024 10:11:07 +0530 Subject: [PATCH 13/28] [Urgent][Docker CI] pin `uv` version for now and a minor change in the Slack notification (#7155) pin uv version for now. --- .github/workflows/build_docker_images.yml | 4 ++-- docker/diffusers-flax-cpu/Dockerfile | 2 +- docker/diffusers-flax-tpu/Dockerfile | 2 +- docker/diffusers-onnxruntime-cpu/Dockerfile | 2 +- docker/diffusers-onnxruntime-cuda/Dockerfile | 2 +- docker/diffusers-pytorch-compile-cuda/Dockerfile | 2 +- docker/diffusers-pytorch-cpu/Dockerfile | 2 +- docker/diffusers-pytorch-cuda/Dockerfile | 2 +- docker/diffusers-pytorch-xformers-cuda/Dockerfile | 2 +- 9 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build_docker_images.yml b/.github/workflows/build_docker_images.yml index 4179eb171cef..aacbd6f09598 100644 --- a/.github/workflows/build_docker_images.yml +++ b/.github/workflows/build_docker_images.yml @@ -62,13 +62,13 @@ jobs: # For posting a rich message using Block Kit payload: | { - "text": "${{ matrix.image-name }} Docker Image build result: ${{ job.status }}\n${{ github.event.pull_request.html_url || github.event.head_commit.url }}", + "text": "${{ matrix.image-name }} Docker Image build result: ${{ job.status }}\n${{ github.event.head_commit.url }}", "blocks": [ { "type": "section", "text": { "type": "mrkdwn", - "text": "${{ matrix.image-name }} Docker Image build result: ${{ job.status }}\n${{ github.event.pull_request.html_url || github.event.head_commit.url }}" + "text": "${{ matrix.image-name }} Docker Image build result: ${{ job.status }}\n${{ github.event.head_commit.url }}" } } ] diff --git a/docker/diffusers-flax-cpu/Dockerfile b/docker/diffusers-flax-cpu/Dockerfile index ec8aef8cdf3e..36d036e34e5f 100644 --- a/docker/diffusers-flax-cpu/Dockerfile +++ b/docker/diffusers-flax-cpu/Dockerfile @@ -23,7 +23,7 @@ ENV PATH="/opt/venv/bin:$PATH" # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py) # follow the instructions here: https://cloud.google.com/tpu/docs/run-in-container#train_a_jax_model_in_a_docker_container -RUN python3 -m pip install --no-cache-dir --upgrade pip uv && \ +RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \ python3 -m uv pip install --upgrade --no-cache-dir \ clu \ "jax[cpu]>=0.2.16,!=0.3.2" \ diff --git a/docker/diffusers-flax-tpu/Dockerfile b/docker/diffusers-flax-tpu/Dockerfile index c0722139ca55..78d5f972a753 100644 --- a/docker/diffusers-flax-tpu/Dockerfile +++ b/docker/diffusers-flax-tpu/Dockerfile @@ -23,7 +23,7 @@ ENV PATH="/opt/venv/bin:$PATH" # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py) # follow the instructions here: https://cloud.google.com/tpu/docs/run-in-container#train_a_jax_model_in_a_docker_container -RUN python3 -m pip install --no-cache-dir --upgrade pip uv && \ +RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \ python3 -m pip install --no-cache-dir \ "jax[tpu]>=0.2.16,!=0.3.2" \ -f https://storage.googleapis.com/jax-releases/libtpu_releases.html && \ diff --git a/docker/diffusers-onnxruntime-cpu/Dockerfile b/docker/diffusers-onnxruntime-cpu/Dockerfile index 6c43e4cde602..0d032d91e5eb 100644 --- a/docker/diffusers-onnxruntime-cpu/Dockerfile +++ b/docker/diffusers-onnxruntime-cpu/Dockerfile @@ -22,7 +22,7 @@ RUN python3 -m 
venv /opt/venv ENV PATH="/opt/venv/bin:$PATH" # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py) -RUN python3 -m pip install --no-cache-dir --upgrade pip uv && \ +RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \ python3 -m uv pip install --no-cache-dir \ torch==2.1.2 \ torchvision==0.16.2 \ diff --git a/docker/diffusers-onnxruntime-cuda/Dockerfile b/docker/diffusers-onnxruntime-cuda/Dockerfile index ce59e039dc25..6cff9cbb56b5 100644 --- a/docker/diffusers-onnxruntime-cuda/Dockerfile +++ b/docker/diffusers-onnxruntime-cuda/Dockerfile @@ -22,7 +22,7 @@ RUN python3 -m venv /opt/venv ENV PATH="/opt/venv/bin:$PATH" # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py) -RUN python3 -m pip install --no-cache-dir --upgrade pip uv && \ +RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \ python3 -m uv pip install --no-cache-dir \ torch==2.1.2 \ torchvision==0.16.2 \ diff --git a/docker/diffusers-pytorch-compile-cuda/Dockerfile b/docker/diffusers-pytorch-compile-cuda/Dockerfile index c90a441ab7c2..b4f507fcf511 100644 --- a/docker/diffusers-pytorch-compile-cuda/Dockerfile +++ b/docker/diffusers-pytorch-compile-cuda/Dockerfile @@ -24,7 +24,7 @@ RUN python3.9 -m venv /opt/venv ENV PATH="/opt/venv/bin:$PATH" # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py) -RUN python3.9 -m pip install --no-cache-dir --upgrade pip uv && \ +RUN python3.9 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \ python3.9 -m uv pip install --no-cache-dir \ torch \ torchvision \ diff --git a/docker/diffusers-pytorch-cpu/Dockerfile b/docker/diffusers-pytorch-cpu/Dockerfile index 75473c3069d7..288559bcfcbd 100644 --- a/docker/diffusers-pytorch-cpu/Dockerfile +++ b/docker/diffusers-pytorch-cpu/Dockerfile @@ -23,7 +23,7 @@ RUN python3 -m venv /opt/venv ENV PATH="/opt/venv/bin:$PATH" # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py) -RUN python3 -m pip install --no-cache-dir --upgrade pip uv && \ +RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \ python3 -m uv pip install --no-cache-dir \ torch \ torchvision \ diff --git a/docker/diffusers-pytorch-cuda/Dockerfile b/docker/diffusers-pytorch-cuda/Dockerfile index 3da6751de9ef..8e0e56bafa02 100644 --- a/docker/diffusers-pytorch-cuda/Dockerfile +++ b/docker/diffusers-pytorch-cuda/Dockerfile @@ -23,7 +23,7 @@ RUN python3 -m venv /opt/venv ENV PATH="/opt/venv/bin:$PATH" # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py) -RUN python3 -m pip install --no-cache-dir --upgrade pip uv && \ +RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \ python3 -m uv pip install --no-cache-dir \ torch \ torchvision \ diff --git a/docker/diffusers-pytorch-xformers-cuda/Dockerfile b/docker/diffusers-pytorch-xformers-cuda/Dockerfile index ca57bebe3318..90ac0d1ebeaf 100644 --- a/docker/diffusers-pytorch-xformers-cuda/Dockerfile +++ b/docker/diffusers-pytorch-xformers-cuda/Dockerfile @@ -23,7 +23,7 @@ RUN python3 -m venv /opt/venv ENV PATH="/opt/venv/bin:$PATH" # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py) -RUN python3 -m pip install --no-cache-dir --upgrade pip uv && \ +RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \ python3 -m pip install --no-cache-dir \ torch \ torchvision \ From 5d3923b670a2d4f214fc2d9201277f72a00b7b29 Mon Sep 
17 00:00:00 2001 From: Sayak Paul Date: Fri, 1 Mar 2024 10:33:44 +0530 Subject: [PATCH 14/28] Fix LCM benchmark test (#7158) * make workflow dispatchable. * fix: lcm lora compile --- .github/workflows/benchmark.yml | 1 + benchmarks/base_classes.py | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index b3ec74d8c15b..c0ae87521fdc 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -1,6 +1,7 @@ name: Benchmarking tests on: + workflow_dispatch: schedule: - cron: "30 1 1,15 * *" # every 2 weeks on the 1st and the 15th of every month at 1:30 AM diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 38e7663fbf6e..8f99b26f3a07 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -141,6 +141,7 @@ def __init__(self, args): super().__init__(args) self.pipe.load_lora_weights(self.lora_id) self.pipe.fuse_lora() + self.pipe.unload_lora_weights() self.pipe.scheduler = LCMScheduler.from_config(self.pipe.scheduler.config) def get_result_filepath(self, args): From 64a8cd627aa9aee50f9d74dd738b3c89f2454cce Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Fri, 1 Mar 2024 11:38:21 +0530 Subject: [PATCH 15/28] [CI] Remove max parallel flag on slow test runners (#7162) update --- .github/workflows/push_tests.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml index 4b6d5bfc3733..74bd3c6e2d53 100644 --- a/.github/workflows/push_tests.yml +++ b/.github/workflows/push_tests.yml @@ -21,7 +21,7 @@ env: jobs: setup_torch_cuda_pipeline_matrix: name: Setup Torch Pipelines CUDA Slow Tests Matrix - runs-on: docker-gpu + runs-on: [single-gpu, nvidia-gpu, t4, ci] container: image: diffusers/diffusers-pytorch-cpu # this is a CPU image, but we need it to fetch the matrix options: --shm-size "16gb" --ipc host @@ -62,7 +62,6 @@ jobs: needs: setup_torch_cuda_pipeline_matrix strategy: fail-fast: false - max-parallel: 1 matrix: module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }} runs-on: [single-gpu, nvidia-gpu, t4, ci] From 66f8bd6869935e25a8091b00636666d9bfac8d2f Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Fri, 1 Mar 2024 12:26:54 +0100 Subject: [PATCH 16/28] Fix vae_encodings_fn hash in train_text_to_image_sdxl.py (#7171) Update train_text_to_image_sdxl.py --- examples/text_to_image/train_text_to_image_sdxl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/text_to_image/train_text_to_image_sdxl.py b/examples/text_to_image/train_text_to_image_sdxl.py index 78021b5afed4..3ad27b90be25 100644 --- a/examples/text_to_image/train_text_to_image_sdxl.py +++ b/examples/text_to_image/train_text_to_image_sdxl.py @@ -895,7 +895,7 @@ def preprocess_train(examples): # fingerprint used by the cache for the other processes to load the result # details: https://github.com/huggingface/diffusers/pull/4038#discussion_r1266078401 new_fingerprint = Hasher.hash(args) - new_fingerprint_for_vae = Hasher.hash("vae") + new_fingerprint_for_vae = Hasher.hash(vae_path) train_dataset = train_dataset.map(compute_embeddings_fn, batched=True, new_fingerprint=new_fingerprint) train_dataset = train_dataset.map( compute_vae_encodings_fn, From 5f150c4cef2140342c66ef44ba8f36b5844a51b1 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Fri, 1 Mar 2024 19:30:48 +0530 Subject: [PATCH 17/28] fix: loading problem for sdxl lora 
dreambooth (#7166) --- examples/dreambooth/train_dreambooth_lora_sdxl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/dreambooth/train_dreambooth_lora_sdxl.py b/examples/dreambooth/train_dreambooth_lora_sdxl.py index 85effa19b59d..849becae4a61 100644 --- a/examples/dreambooth/train_dreambooth_lora_sdxl.py +++ b/examples/dreambooth/train_dreambooth_lora_sdxl.py @@ -1178,7 +1178,7 @@ def load_model_hook(models, input_dir): _set_state_dict_into_text_encoder(lora_state_dict, prefix="text_encoder.", text_encoder=text_encoder_one_) _set_state_dict_into_text_encoder( - lora_state_dict, prefix="text_encoder_2.", text_encoder=text_encoder_one_ + lora_state_dict, prefix="text_encoder_2.", text_encoder=text_encoder_two_ ) # Make sure the trainable params are in float32. This is again needed since the base models From 9a2600ede9810b907668dc806889237bde4e962a Mon Sep 17 00:00:00 2001 From: Oleh Date: Fri, 1 Mar 2024 18:08:21 +0200 Subject: [PATCH 18/28] Map speedup (#6745) * Speed up dataset mapping * Fix missing columns * Remove cache files cleanup * Update examples/text_to_image/train_text_to_image_sdxl.py * make style * Fix code style * style * Empty-Commit --------- Co-authored-by: Sayak Paul Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Co-authored-by: Quentin Lhoest --- .../text_to_image/train_text_to_image_sdxl.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/examples/text_to_image/train_text_to_image_sdxl.py b/examples/text_to_image/train_text_to_image_sdxl.py index 3ad27b90be25..06885b9207be 100644 --- a/examples/text_to_image/train_text_to_image_sdxl.py +++ b/examples/text_to_image/train_text_to_image_sdxl.py @@ -35,7 +35,7 @@ from accelerate import Accelerator from accelerate.logging import get_logger from accelerate.utils import ProjectConfiguration, set_seed -from datasets import load_dataset +from datasets import concatenate_datasets, load_dataset from huggingface_hub import create_repo, upload_folder from packaging import version from torchvision import transforms @@ -896,13 +896,19 @@ def preprocess_train(examples): # details: https://github.com/huggingface/diffusers/pull/4038#discussion_r1266078401 new_fingerprint = Hasher.hash(args) new_fingerprint_for_vae = Hasher.hash(vae_path) - train_dataset = train_dataset.map(compute_embeddings_fn, batched=True, new_fingerprint=new_fingerprint) - train_dataset = train_dataset.map( + train_dataset_with_embeddings = train_dataset.map( + compute_embeddings_fn, batched=True, new_fingerprint=new_fingerprint + ) + train_dataset_with_vae = train_dataset.map( compute_vae_encodings_fn, batched=True, batch_size=args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps, new_fingerprint=new_fingerprint_for_vae, ) + precomputed_dataset = concatenate_datasets( + [train_dataset_with_embeddings, train_dataset_with_vae.remove_columns(["image", "text"])], axis=1 + ) + precomputed_dataset = precomputed_dataset.with_transform(preprocess_train) del text_encoders, tokenizers, vae gc.collect() @@ -925,7 +931,7 @@ def collate_fn(examples): # DataLoaders creation: train_dataloader = torch.utils.data.DataLoader( - train_dataset, + precomputed_dataset, shuffle=True, collate_fn=collate_fn, batch_size=args.train_batch_size, @@ -976,7 +982,7 @@ def unwrap_model(model): total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps logger.info("***** Running training *****") - logger.info(f" Num examples = 
{len(train_dataset)}") + logger.info(f" Num examples = {len(precomputed_dataset)}") logger.info(f" Num Epochs = {args.num_train_epochs}") logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") From ec953047bc0f4a3542e673f3d463543c02505ca5 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Fri, 1 Mar 2024 09:44:00 -1000 Subject: [PATCH 19/28] [stalebot] fix a bug (#7156) fix Co-authored-by: yiyixuxu --- utils/stale.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/stale.py b/utils/stale.py index 30dddd4475d8..3c0f8ab83339 100644 --- a/utils/stale.py +++ b/utils/stale.py @@ -39,7 +39,7 @@ def main(): open_issues = repo.get_issues(state="open") for issue in open_issues: - labels = [label.name for label in issue.get_labels()] + labels = [label.name.lower() for label in issue.get_labels()] if "stale" in labels: comments = sorted(issue.get_comments(), key=lambda i: i.created_at, reverse=True) last_comment = comments[0] if len(comments) > 0 else None @@ -50,7 +50,7 @@ def main(): elif ( (dt.now(timezone.utc) - issue.updated_at).days > 23 and (dt.now(timezone.utc) - issue.created_at).days >= 30 - and not any(label.name.lower() in LABELS_TO_EXEMPT for label in labels) + and not any(label in LABELS_TO_EXEMPT for label in labels) ): # Post a Stalebot notification after 23 days of inactivity. issue.create_comment( From ccb93dcad10ff5c0d2f3e41333f995cf636f6154 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Sun, 3 Mar 2024 09:28:57 +0530 Subject: [PATCH 20/28] Support EDM-style training in DreamBooth LoRA SDXL script (#7126) * add: dreambooth lora script for Playground v2.5 * fix: kwarg * address suraj's comments. * Apply suggestions from code review Co-authored-by: Suraj Patil * apply suraj's suggestion * incorporate changes in the canonical script./ * tracker naming * fix: schedule determination * add: two simple tests * remove playground script * note about edm-style training * address pedro's comments. * address part of Suraj's comments. * Apply suggestions from code review Co-authored-by: Suraj Patil * remove guidance_scale. * use mse_loss. * add comments for preconditioning. * quality * Update examples/dreambooth/train_dreambooth_lora_sdxl.py Co-authored-by: Suraj Patil * tackle v-pred. * Empty-Commit * support edm for sdxl too. * address suraj's comments. * Empty-Commit --------- Co-authored-by: Suraj Patil --- examples/dreambooth/README_sdxl.md | 37 ++++ .../dreambooth/test_dreambooth_lora_edm.py | 99 +++++++++ .../dreambooth/train_dreambooth_lora_sdxl.py | 200 +++++++++++++++--- 3 files changed, 307 insertions(+), 29 deletions(-) create mode 100644 examples/dreambooth/test_dreambooth_lora_edm.py diff --git a/examples/dreambooth/README_sdxl.md b/examples/dreambooth/README_sdxl.md index 70b9821553de..aa1aa927c5e9 100644 --- a/examples/dreambooth/README_sdxl.md +++ b/examples/dreambooth/README_sdxl.md @@ -206,3 +206,40 @@ You can explore the results from a couple of our internal experiments by checkin ## Running on a free-tier Colab Notebook Check out [this notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/SDXL_DreamBooth_LoRA_.ipynb). + +## Conducting EDM-style training + +It's now possible to perform EDM-style training as proposed in [Elucidating the Design Space of Diffusion-Based Generative Models](https://arxiv.org/abs/2206.00364). 
+ +For the SDXL model, simply set: + +```diff ++ --do_edm_style_training \ +``` + +Other SDXL-like models that use the EDM formulation, such as [playgroundai/playground-v2.5-1024px-aesthetic](https://huggingface.co/playgroundai/playground-v2.5-1024px-aesthetic), can also be DreamBooth'd with the script. Below is an example command: + +```bash +accelerate launch train_dreambooth_lora_sdxl.py \ + --pretrained_model_name_or_path="playgroundai/playground-v2.5-1024px-aesthetic" \ + --instance_data_dir="dog" \ + --output_dir="dog-playground-lora" \ + --mixed_precision="fp16" \ + --instance_prompt="a photo of sks dog" \ + --resolution=1024 \ + --train_batch_size=1 \ + --gradient_accumulation_steps=4 \ + --learning_rate=1e-4 \ + --use_8bit_adam \ + --report_to="wandb" \ + --lr_scheduler="constant" \ + --lr_warmup_steps=0 \ + --max_train_steps=500 \ + --validation_prompt="A photo of sks dog in a bucket" \ + --validation_epochs=25 \ + --seed="0" \ + --push_to_hub +``` + +> [!CAUTION] +> Min-SNR gamma is not supported with the EDM-style training yet. When training with the PlaygroundAI model, it's recommended not to pass any "variant". diff --git a/examples/dreambooth/test_dreambooth_lora_edm.py b/examples/dreambooth/test_dreambooth_lora_edm.py new file mode 100644 index 000000000000..0f6b3674b8b3 --- /dev/null +++ b/examples/dreambooth/test_dreambooth_lora_edm.py @@ -0,0 +1,99 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +import sys +import tempfile + +import safetensors + + +sys.path.append("..") +from test_examples_utils import ExamplesTestsAccelerate, run_command # noqa: E402 + + +logging.basicConfig(level=logging.DEBUG) + +logger = logging.getLogger() +stream_handler = logging.StreamHandler(sys.stdout) +logger.addHandler(stream_handler) + + +class DreamBoothLoRASDXLWithEDM(ExamplesTestsAccelerate): + def test_dreambooth_lora_sdxl_with_edm(self): + with tempfile.TemporaryDirectory() as tmpdir: + test_args = f""" + examples/dreambooth/train_dreambooth_lora_sdxl.py + --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-xl-pipe + --do_edm_style_training + --instance_data_dir docs/source/en/imgs + --instance_prompt photo + --resolution 64 + --train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 2 + --learning_rate 5.0e-04 + --scale_lr + --lr_scheduler constant + --lr_warmup_steps 0 + --output_dir {tmpdir} + """.split() + + run_command(self._launch_args + test_args) + # save_pretrained smoke test + self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))) + + # make sure the state_dict has the correct naming in the parameters. + lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")) + is_lora = all("lora" in k for k in lora_state_dict.keys()) + self.assertTrue(is_lora) + + # when not training the text encoder, all the parameters in the state dict should start + # with `"unet"` in their names.
+ starts_with_unet = all(key.startswith("unet") for key in lora_state_dict.keys()) + self.assertTrue(starts_with_unet) + + def test_dreambooth_lora_playground(self): + with tempfile.TemporaryDirectory() as tmpdir: + test_args = f""" + examples/dreambooth/train_dreambooth_lora_sdxl.py + --pretrained_model_name_or_path hf-internal-testing/tiny-playground-v2-5-pipe + --instance_data_dir docs/source/en/imgs + --instance_prompt photo + --resolution 64 + --train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 2 + --learning_rate 5.0e-04 + --scale_lr + --lr_scheduler constant + --lr_warmup_steps 0 + --output_dir {tmpdir} + """.split() + + run_command(self._launch_args + test_args) + # save_pretrained smoke test + self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))) + + # make sure the state_dict has the correct naming in the parameters. + lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")) + is_lora = all("lora" in k for k in lora_state_dict.keys()) + self.assertTrue(is_lora) + + # when not training the text encoder, all the parameters in the state dict should start + # with `"unet"` in their names. + starts_with_unet = all(key.startswith("unet") for key in lora_state_dict.keys()) + self.assertTrue(starts_with_unet) diff --git a/examples/dreambooth/train_dreambooth_lora_sdxl.py b/examples/dreambooth/train_dreambooth_lora_sdxl.py index 849becae4a61..f4d0efc411fd 100644 --- a/examples/dreambooth/train_dreambooth_lora_sdxl.py +++ b/examples/dreambooth/train_dreambooth_lora_sdxl.py @@ -14,8 +14,10 @@ # See the License for the specific language governing permissions and import argparse +import contextlib import gc import itertools +import json import logging import math import os @@ -32,7 +34,7 @@ from accelerate import Accelerator from accelerate.logging import get_logger from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed -from huggingface_hub import create_repo, upload_folder +from huggingface_hub import create_repo, hf_hub_download, upload_folder from huggingface_hub.utils import insecure_hashlib from packaging import version from peft import LoraConfig, set_peft_model_state_dict @@ -50,6 +52,8 @@ AutoencoderKL, DDPMScheduler, DPMSolverMultistepScheduler, + EDMEulerScheduler, + EulerDiscreteScheduler, StableDiffusionXLPipeline, UNet2DConditionModel, ) @@ -76,6 +80,20 @@ logger = get_logger(__name__) +def determine_scheduler_type(pretrained_model_name_or_path, revision): + model_index_filename = "model_index.json" + if os.path.isdir(pretrained_model_name_or_path): + model_index = os.path.join(pretrained_model_name_or_path, model_index_filename) + else: + model_index = hf_hub_download( + repo_id=pretrained_model_name_or_path, filename=model_index_filename, revision=revision + ) + + with open(model_index, "r") as f: + scheduler_type = json.load(f)["scheduler"][1] + return scheduler_type + + def save_model_card( repo_id: str, images=None, @@ -95,7 +113,7 @@ def save_model_card( ) model_description = f""" -# SDXL LoRA DreamBooth - {repo_id} +# {'SDXL' if 'playgroundai' not in base_model else 'Playground'} LoRA DreamBooth - {repo_id} @@ -119,11 +137,17 @@ def save_model_card( [Download]({repo_id}/tree/main) them in the Files & versions tab. 
+""" + if "playgroundai" in args.pretrained_model_name_or_path: + model_description += """\n +## License + +Please adhere to the licensing terms as described [here](https://huggingface.co/playgroundai/playground-v2.5-1024px-aesthetic/blob/main/LICENSE.md). """ model_card = load_or_create_model_card( repo_id_or_path=repo_id, from_training=True, - license="openrail++", + license="openrail++" if "playgroundai" not in base_model else "playground-v2dot5-community", base_model=base_model, prompt=instance_prompt, model_description=model_description, @@ -131,15 +155,17 @@ def save_model_card( ) tags = [ "text-to-image", - "stable-diffusion-xl", - "stable-diffusion-xl-diffusers", "text-to-image", "diffusers", "lora", "template:sd-lora", ] - model_card = populate_model_card(model_card, tags=tags) + if "playgroundai" in base_model: + tags.extend(["playground", "playground-diffusers"]) + else: + tags.extend(["stable-diffusion-xl", "stable-diffusion-xl-diffusers"]) + model_card = populate_model_card(model_card, tags=tags) model_card.save(os.path.join(repo_folder, "README.md")) @@ -159,23 +185,29 @@ def log_validation( # We train on the simplified learning objective. If we were previously predicting a variance, we need the scheduler to ignore it scheduler_args = {} - if "variance_type" in pipeline.scheduler.config: - variance_type = pipeline.scheduler.config.variance_type + if not args.do_edm_style_training: + if "variance_type" in pipeline.scheduler.config: + variance_type = pipeline.scheduler.config.variance_type - if variance_type in ["learned", "learned_range"]: - variance_type = "fixed_small" + if variance_type in ["learned", "learned_range"]: + variance_type = "fixed_small" - scheduler_args["variance_type"] = variance_type + scheduler_args["variance_type"] = variance_type - pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, **scheduler_args) + pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, **scheduler_args) pipeline = pipeline.to(accelerator.device) pipeline.set_progress_bar_config(disable=True) # run inference generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None + # Currently the context determination is a bit hand-wavy. We can improve it in the future if there's a better + # way to condition it. Reference: https://github.com/huggingface/diffusers/pull/7126#issuecomment-1968523051 + inference_ctx = ( + contextlib.nullcontext() if "playgroundai" in args.pretrained_model_name_or_path else torch.cuda.amp.autocast() + ) - with torch.cuda.amp.autocast(): + with inference_ctx: images = [pipeline(**pipeline_args, generator=generator).images[0] for _ in range(args.num_validation_images)] for tracker in accelerator.trackers: @@ -334,6 +366,12 @@ def parse_args(input_args=None): " `args.validation_prompt` multiple times: `args.num_validation_images`." ), ) + parser.add_argument( + "--do_edm_style_training", + default=False, + action="store_true", + help="Flag to conduct training using the EDM formulation as introduced in https://arxiv.org/abs/2206.00364.", + ) parser.add_argument( "--with_prior_preservation", default=False, @@ -905,6 +943,9 @@ def main(args): " Please use `huggingface-cli login` to authenticate with the Hub." 
) + if args.do_edm_style_training and args.snr_gamma is not None: + raise ValueError("Min-SNR formulation is not supported when conducting EDM-style training.") + logging_dir = Path(args.output_dir, args.logging_dir) accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir) @@ -1018,7 +1059,19 @@ def main(args): ) # Load scheduler and models - noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") + scheduler_type = determine_scheduler_type(args.pretrained_model_name_or_path, args.revision) + if "EDM" in scheduler_type: + args.do_edm_style_training = True + noise_scheduler = EDMEulerScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") + logger.info("Performing EDM-style training!") + elif args.do_edm_style_training: + noise_scheduler = EulerDiscreteScheduler.from_pretrained( + args.pretrained_model_name_or_path, subfolder="scheduler" + ) + logger.info("Performing EDM-style training!") + else: + noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") + text_encoder_one = text_encoder_cls_one.from_pretrained( args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant ) @@ -1036,6 +1089,12 @@ def main(args): revision=args.revision, variant=args.variant, ) + latents_mean = latents_std = None + if hasattr(vae.config, "latents_mean") and vae.config.latents_mean is not None: + latents_mean = torch.tensor(vae.config.latents_mean).view(1, 4, 1, 1) + if hasattr(vae.config, "latents_std") and vae.config.latents_std is not None: + latents_std = torch.tensor(vae.config.latents_std).view(1, 4, 1, 1) + unet = UNet2DConditionModel.from_pretrained( args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant ) @@ -1433,7 +1492,12 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers): # We need to initialize the trackers we use, and also store our configuration. # The trackers initializes automatically on the main process. if accelerator.is_main_process: - accelerator.init_trackers("dreambooth-lora-sd-xl", config=vars(args)) + tracker_name = ( + "dreambooth-lora-sd-xl" + if "playgroundai" not in args.pretrained_model_name_or_path + else "dreambooth-lora-playground" + ) + accelerator.init_trackers(tracker_name, config=vars(args)) # Train! 
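[Editor's note: a hedged sketch, not part of the patch, of how the scheduler auto-detection added above behaves. `determine_scheduler_type` is the helper introduced by this patch; the repo id is real, but the exact class name returned for Playground v2.5 is an assumption based on that model's `model_index.json`.]

```python
# Sketch: the helper reads model_index.json and returns the scheduler class name.
scheduler_type = determine_scheduler_type(
    "playgroundai/playground-v2.5-1024px-aesthetic", revision=None
)
# Expected to contain "EDM" (e.g. "EDMDPMSolverMultistepScheduler"), in which
# case the branch above forces args.do_edm_style_training = True automatically.
assert "EDM" in scheduler_type
```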
total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps @@ -1485,6 +1549,18 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers): disable=not accelerator.is_local_main_process, ) + def get_sigmas(timesteps, n_dim=4, dtype=torch.float32): + sigmas = noise_scheduler.sigmas.to(device=accelerator.device, dtype=dtype) + schedule_timesteps = noise_scheduler.timesteps.to(accelerator.device) + timesteps = timesteps.to(accelerator.device) + + step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps] + + sigma = sigmas[step_indices].flatten() + while len(sigma.shape) < n_dim: + sigma = sigma.unsqueeze(-1) + return sigma + for epoch in range(first_epoch, args.num_train_epochs): unet.train() if args.train_text_encoder: @@ -1512,22 +1588,46 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers): # Convert images to latent space model_input = vae.encode(pixel_values).latent_dist.sample() - model_input = model_input * vae.config.scaling_factor - if args.pretrained_vae_model_name_or_path is None: - model_input = model_input.to(weight_dtype) + + if latents_mean is None and latents_std is None: + model_input = model_input * vae.config.scaling_factor + if args.pretrained_vae_model_name_or_path is None: + model_input = model_input.to(weight_dtype) + else: + latents_mean = latents_mean.to(device=model_input.device, dtype=model_input.dtype) + latents_std = latents_std.to(device=model_input.device, dtype=model_input.dtype) + model_input = (model_input - latents_mean) * vae.config.scaling_factor / latents_std + model_input = model_input.to(dtype=weight_dtype) # Sample noise that we'll add to the latents noise = torch.randn_like(model_input) bsz = model_input.shape[0] + # Sample a random timestep for each image - timesteps = torch.randint( - 0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device - ) - timesteps = timesteps.long() + if not args.do_edm_style_training: + timesteps = torch.randint( + 0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device + ) + timesteps = timesteps.long() + else: + # in EDM formulation, the model is conditioned on the pre-conditioned noise levels + # instead of discrete timesteps, so here we sample indices to get the noise levels + # from `scheduler.timesteps` + indices = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,)) + timesteps = noise_scheduler.timesteps[indices].to(device=model_input.device) # Add noise to the model input according to the noise magnitude at each timestep # (this is the forward diffusion process) noisy_model_input = noise_scheduler.add_noise(model_input, noise, timesteps) + # For EDM-style training, we first obtain the sigmas based on the continuous timesteps. + # We then precondition the final model inputs based on these sigmas instead of the timesteps. + # Follow: Section 5 of https://arxiv.org/abs/2206.00364. 
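[Editor's note: a minimal numeric sketch, not part of the patch, of the EDM preconditioning that the hunk continues with below. Shapes and sigma values are made up for illustration; the formulas mirror the non-EDM-scheduler branch of the patch.]

```python
import torch

# Three samples at illustrative noise levels; latents are random stand-ins.
sigma = torch.tensor([0.5, 1.0, 14.6]).view(-1, 1, 1, 1)
noisy_model_input = torch.randn(3, 4, 64, 64)

# Input preconditioning, c_in(sigma) = 1 / sqrt(sigma^2 + 1):
inp_noisy_latents = noisy_model_input / ((sigma**2 + 1) ** 0.5)

# Output preconditioning for an epsilon-prediction model: the raw prediction
# (a random stand-in for the UNet output) is mapped back to a denoised sample.
model_pred = torch.randn_like(noisy_model_input)
denoised = model_pred * (-sigma) + noisy_model_input

print(inp_noisy_latents.shape, denoised.shape)  # both torch.Size([3, 4, 64, 64])
```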
+ if args.do_edm_style_training: + sigmas = get_sigmas(timesteps, len(noisy_model_input.shape), noisy_model_input.dtype) + if "EDM" in scheduler_type: + inp_noisy_latents = noise_scheduler.precondition_inputs(noisy_model_input, sigmas) + else: + inp_noisy_latents = noisy_model_input / ((sigmas**2 + 1) ** 0.5) # time ids add_time_ids = torch.cat( @@ -1551,7 +1651,7 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers): } prompt_embeds_input = prompt_embeds.repeat(elems_to_repeat_text_embeds, 1, 1) model_pred = unet( - noisy_model_input, + inp_noisy_latents if args.do_edm_style_training else noisy_model_input, timesteps, prompt_embeds_input, added_cond_kwargs=unet_added_conditions, @@ -1570,18 +1670,43 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers): ) prompt_embeds_input = prompt_embeds.repeat(elems_to_repeat_text_embeds, 1, 1) model_pred = unet( - noisy_model_input, + inp_noisy_latents if args.do_edm_style_training else noisy_model_input, timesteps, prompt_embeds_input, added_cond_kwargs=unet_added_conditions, return_dict=False, )[0] + weighting = None + if args.do_edm_style_training: + # Similar to the input preconditioning, the model predictions are also preconditioned + # on noised model inputs (before preconditioning) and the sigmas. + # Follow: Section 5 of https://arxiv.org/abs/2206.00364. + if "EDM" in scheduler_type: + model_pred = noise_scheduler.precondition_outputs(noisy_model_input, model_pred, sigmas) + else: + if noise_scheduler.config.prediction_type == "epsilon": + model_pred = model_pred * (-sigmas) + noisy_model_input + elif noise_scheduler.config.prediction_type == "v_prediction": + model_pred = model_pred * (-sigmas / (sigmas**2 + 1) ** 0.5) + ( + noisy_model_input / (sigmas**2 + 1) + ) + # We are not doing weighting here because it tends to result in numerical problems.
+ # See: https://github.com/huggingface/diffusers/pull/7126#issuecomment-1968523051 + # There might be other alternatives for weighting as well: + # https://github.com/huggingface/diffusers/pull/7126#discussion_r1505404686 + if "EDM" not in scheduler_type: + weighting = (sigmas**-2.0).float() + # Get the target for loss depending on the prediction type if noise_scheduler.config.prediction_type == "epsilon": - target = noise + target = model_input if args.do_edm_style_training else noise elif noise_scheduler.config.prediction_type == "v_prediction": - target = noise_scheduler.get_velocity(model_input, noise, timesteps) + target = ( + model_input + if args.do_edm_style_training + else noise_scheduler.get_velocity(model_input, noise, timesteps) + ) else: raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") @@ -1591,10 +1716,28 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers): target, target_prior = torch.chunk(target, 2, dim=0) # Compute prior loss - prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean") + if weighting is not None: + prior_loss = torch.mean( + (weighting.float() * (model_pred_prior.float() - target_prior.float()) ** 2).reshape( + target_prior.shape[0], -1 + ), + 1, + ) + prior_loss = prior_loss.mean() + else: + prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean") if args.snr_gamma is None: - loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") + if weighting is not None: + loss = torch.mean( + (weighting.float() * (model_pred.float() - target.float()) ** 2).reshape( + target.shape[0], -1 + ), + 1, + ) + loss = loss.mean() + else: + loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") else: # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556. # Since we predict the noise instead of x_0, the original formulation is slightly changed. @@ -1696,7 +1839,6 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers): variant=args.variant, torch_dtype=weight_dtype, ) - pipeline_args = {"prompt": args.validation_prompt} images = log_validation( From f55873b783e2739124360d869955f9218817c211 Mon Sep 17 00:00:00 2001 From: Junsong Chen Date: Sun, 3 Mar 2024 13:01:21 +0800 Subject: [PATCH 21/28] Fix PixArt 256px inference (#6789) * feat 256px diffusers inference bug * change the max_length of T5 to pipeline config file * fix bug in convert_pixart_alpha_to_diffusers.py * Update scripts/convert_pixart_alpha_to_diffusers.py Co-authored-by: Sayak Paul * remove multi_scale_train parser * Update src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py Co-authored-by: YiYi Xu * Update src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py Co-authored-by: YiYi Xu * styling * change `model_token_max_length` to call argument. * Refactoring * add: max_sequence_length to the docstring. 
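[Editor's note: a small sketch, not part of the patch, of the interpolation-scale convention the diff below generalizes; the mapping is taken from the updated conversion script.]

```python
# 512px PixArt (latent sample_size 64) uses interpolation scale 1, 1024px uses 2.
# 256px needs 0.5, which the old integer default max(sample_size // 64, 1)
# cannot produce; hence the new explicit interpolation_scale argument.
interpolation_scale = {256: 0.5, 512: 1, 1024: 2}
for image_size, sample_size in [(256, 32), (512, 64), (1024, 128)]:
    old_default = max(sample_size // 64, 1)
    print(image_size, old_default, interpolation_scale[image_size])
# 256 -> old default 1 (wrong), correct value 0.5
```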
--------- Co-authored-by: Sayak Paul Co-authored-by: YiYi Xu --- scripts/convert_pixart_alpha_to_diffusers.py | 8 +-- .../models/transformers/transformer_2d.py | 6 +- .../pixart_alpha/pipeline_pixart_alpha.py | 56 +++++++++++++++++-- 3 files changed, 59 insertions(+), 11 deletions(-) diff --git a/scripts/convert_pixart_alpha_to_diffusers.py b/scripts/convert_pixart_alpha_to_diffusers.py index fc037c87f5d5..228b479df090 100644 --- a/scripts/convert_pixart_alpha_to_diffusers.py +++ b/scripts/convert_pixart_alpha_to_diffusers.py @@ -9,11 +9,11 @@ ckpt_id = "PixArt-alpha/PixArt-alpha" # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/scripts/inference.py#L125 -interpolation_scale = {512: 1, 1024: 2} +interpolation_scale = {256: 0.5, 512: 1, 1024: 2} def main(args): - all_state_dict = torch.load(args.orig_ckpt_path) + all_state_dict = torch.load(args.orig_ckpt_path, map_location="cpu") state_dict = all_state_dict.pop("state_dict") converted_state_dict = {} @@ -22,7 +22,6 @@ def main(args): converted_state_dict["pos_embed.proj.bias"] = state_dict.pop("x_embedder.proj.bias") # Caption projection. - converted_state_dict["caption_projection.y_embedding"] = state_dict.pop("y_embedder.y_embedding") converted_state_dict["caption_projection.linear_1.weight"] = state_dict.pop("y_embedder.y_proj.fc1.weight") converted_state_dict["caption_projection.linear_1.bias"] = state_dict.pop("y_embedder.y_proj.fc1.bias") converted_state_dict["caption_projection.linear_2.weight"] = state_dict.pop("y_embedder.y_proj.fc2.weight") @@ -155,6 +154,7 @@ def main(args): assert transformer.pos_embed.pos_embed is not None state_dict.pop("pos_embed") + state_dict.pop("y_embedder.y_embedding") assert len(state_dict) == 0, f"State dict is not empty, {state_dict.keys()}" num_model_params = sum(p.numel() for p in transformer.parameters()) @@ -187,7 +187,7 @@ def main(args): "--image_size", default=1024, type=int, - choices=[512, 1024], + choices=[256, 512, 1024], required=False, help="Image size of pretrained model, either 512 or 1024.", ) diff --git a/src/diffusers/models/transformers/transformer_2d.py b/src/diffusers/models/transformers/transformer_2d.py index aaf5f2a901b8..bd632660f46c 100644 --- a/src/diffusers/models/transformers/transformer_2d.py +++ b/src/diffusers/models/transformers/transformer_2d.py @@ -97,6 +97,7 @@ def __init__( norm_eps: float = 1e-5, attention_type: str = "default", caption_channels: int = None, + interpolation_scale: float = None, ): super().__init__() self.use_linear_projection = use_linear_projection @@ -168,8 +169,9 @@ def __init__( self.width = sample_size self.patch_size = patch_size - interpolation_scale = self.config.sample_size // 64 # => 64 (= 512 pixart) has interpolation scale 1 - interpolation_scale = max(interpolation_scale, 1) + interpolation_scale = ( + interpolation_scale if interpolation_scale is not None else max(self.config.sample_size // 64, 1) + ) self.pos_embed = PatchEmbed( height=sample_size, width=sample_size, diff --git a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py index b4453e63d28f..c12bca90aa14 100644 --- a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +++ b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py @@ -133,6 +133,42 @@ "4.0": [1024.0, 256.0], } +ASPECT_RATIO_256_BIN = { + "0.25": [128.0, 512.0], + "0.28": [128.0, 464.0], + "0.32": [144.0, 448.0], + "0.33": [144.0, 432.0], + "0.35": [144.0, 416.0], + "0.4": 
[160.0, 400.0], + "0.42": [160.0, 384.0], + "0.48": [176.0, 368.0], + "0.5": [176.0, 352.0], + "0.52": [176.0, 336.0], + "0.57": [192.0, 336.0], + "0.6": [192.0, 320.0], + "0.68": [208.0, 304.0], + "0.72": [208.0, 288.0], + "0.78": [224.0, 288.0], + "0.82": [224.0, 272.0], + "0.88": [240.0, 272.0], + "0.94": [240.0, 256.0], + "1.0": [256.0, 256.0], + "1.07": [256.0, 240.0], + "1.13": [272.0, 240.0], + "1.21": [272.0, 224.0], + "1.29": [288.0, 224.0], + "1.38": [288.0, 208.0], + "1.46": [304.0, 208.0], + "1.67": [320.0, 192.0], + "1.75": [336.0, 192.0], + "2.0": [352.0, 176.0], + "2.09": [368.0, 176.0], + "2.4": [384.0, 160.0], + "2.5": [400.0, 160.0], + "3.0": [432.0, 144.0], + "4.0": [512.0, 128.0], +} + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps def retrieve_timesteps( @@ -260,6 +296,7 @@ def encode_prompt( prompt_attention_mask: Optional[torch.FloatTensor] = None, negative_prompt_attention_mask: Optional[torch.FloatTensor] = None, clean_caption: bool = False, + max_sequence_length: int = 120, **kwargs, ): r""" @@ -284,8 +321,9 @@ def encode_prompt( negative_prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated negative text embeddings. For PixArt-Alpha, it's should be the embeddings of the "" string. - clean_caption (bool, defaults to `False`): + clean_caption (`bool`, defaults to `False`): If `True`, the function will preprocess and clean the provided caption before encoding. + max_sequence_length (`int`, defaults to 120): Maximum sequence length to use for the prompt. """ if "mask_feature" in kwargs: @@ -303,7 +341,7 @@ def encode_prompt( batch_size = prompt_embeds.shape[0] # See Section 3.1. of the paper. - max_length = 120 + max_length = max_sequence_length if prompt_embeds is None: prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) @@ -688,6 +726,7 @@ def __call__( callback_steps: int = 1, clean_caption: bool = True, use_resolution_binning: bool = True, + max_sequence_length: int = 120, **kwargs, ) -> Union[ImagePipelineOutput, Tuple]: """ @@ -757,6 +796,7 @@ def __call__( If set to `True`, the requested height and width are first mapped to the closest resolutions using `ASPECT_RATIO_1024_BIN`. After the produced latents are decoded into images, they are resized back to the requested resolution. Useful for generating non-square images. + max_sequence_length (`int` defaults to 120): Maximum sequence length to use with the `prompt`. 
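[Editor's note: a hedged usage sketch, not from the patch, of the new `max_sequence_length` argument and 256px support. The repo id below is hypothetical; any PixArt checkpoint converted with the updated script at image size 256 (transformer sample_size 32) would select `ASPECT_RATIO_256_BIN` when resolution binning is enabled.]

```python
import torch
from diffusers import PixArtAlphaPipeline

# Hypothetical 256px checkpoint id; substitute a real converted checkpoint.
pipe = PixArtAlphaPipeline.from_pretrained(
    "PixArt-alpha/PixArt-XL-2-256x256", torch_dtype=torch.float16
).to("cuda")

image = pipe(
    "a small cactus with a happy face in the Sahara desert",
    max_sequence_length=120,  # the default T5 token budget for the prompt
).images[0]
```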
Examples: @@ -772,9 +812,14 @@ def __call__( height = height or self.transformer.config.sample_size * self.vae_scale_factor width = width or self.transformer.config.sample_size * self.vae_scale_factor if use_resolution_binning: - aspect_ratio_bin = ( - ASPECT_RATIO_1024_BIN if self.transformer.config.sample_size == 128 else ASPECT_RATIO_512_BIN - ) + if self.transformer.config.sample_size == 128: + aspect_ratio_bin = ASPECT_RATIO_1024_BIN + elif self.transformer.config.sample_size == 64: + aspect_ratio_bin = ASPECT_RATIO_512_BIN + elif self.transformer.config.sample_size == 32: + aspect_ratio_bin = ASPECT_RATIO_256_BIN + else: + raise ValueError("Invalid sample size") orig_height, orig_width = height, width height, width = self.classify_height_width_bin(height, width, ratios=aspect_ratio_bin) @@ -822,6 +867,7 @@ def __call__( prompt_attention_mask=prompt_attention_mask, negative_prompt_attention_mask=negative_prompt_attention_mask, clean_caption=clean_caption, + max_sequence_length=max_sequence_length, ) if do_classifier_free_guidance: prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) From 001b14023e7b8b69366abe2840796d8d4b5d1283 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Somoza?= Date: Sun, 3 Mar 2024 04:37:00 -0300 Subject: [PATCH 22/28] [ip-adapter] fix problem using embeds with the plus version of ip adapters (#7189) * initial * check_inputs fix to the rest of pipelines * add fix for no cfg too * use of variable --------- Co-authored-by: Sayak Paul --- .../animatediff/pipeline_animatediff.py | 17 ++++++++++++----- .../pipeline_animatediff_video2video.py | 17 ++++++++++++----- .../pipelines/controlnet/pipeline_controlnet.py | 17 ++++++++++++----- .../controlnet/pipeline_controlnet_img2img.py | 17 ++++++++++++----- .../controlnet/pipeline_controlnet_inpaint.py | 17 ++++++++++++----- .../pipeline_controlnet_inpaint_sd_xl.py | 17 ++++++++++++----- .../controlnet/pipeline_controlnet_sd_xl.py | 17 ++++++++++++----- .../pipeline_controlnet_sd_xl_img2img.py | 17 ++++++++++++----- .../pipeline_latent_consistency_img2img.py | 17 ++++++++++++----- .../pipeline_latent_consistency_text2img.py | 17 ++++++++++++----- src/diffusers/pipelines/pia/pipeline_pia.py | 17 ++++++++++++----- .../pipeline_stable_diffusion.py | 17 ++++++++++++----- .../pipeline_stable_diffusion_img2img.py | 17 ++++++++++++----- .../pipeline_stable_diffusion_inpaint.py | 17 ++++++++++++----- .../pipeline_stable_diffusion_ldm3d.py | 17 ++++++++++++----- .../pipeline_stable_diffusion_panorama.py | 17 ++++++++++++----- .../pipeline_stable_diffusion_xl.py | 17 ++++++++++++----- .../pipeline_stable_diffusion_xl_img2img.py | 17 ++++++++++++----- .../pipeline_stable_diffusion_xl_inpaint.py | 17 ++++++++++++----- .../pipeline_stable_diffusion_xl_adapter.py | 17 ++++++++++++----- 20 files changed, 240 insertions(+), 100 deletions(-) diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py index e98d4ad4e37b..ce2f6585c601 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py @@ -400,15 +400,22 @@ def prepare_ip_adapter_image_embeds( image_embeds.append(single_image_embeds) else: + repeat_dims = [1] image_embeds = [] for single_image_embeds in ip_adapter_image_embeds: if do_classifier_free_guidance: single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) - single_negative_image_embeds = 
single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1) - single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) else: - single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) image_embeds.append(single_image_embeds) return image_embeds @@ -509,9 +516,9 @@ def check_inputs( raise ValueError( f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" ) - elif ip_adapter_image_embeds[0].ndim != 3: + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: raise ValueError( - f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D" + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" ) # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.prepare_latents diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py index ced64889044f..bcfcd3a24b5d 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py @@ -478,15 +478,22 @@ def prepare_ip_adapter_image_embeds( image_embeds.append(single_image_embeds) else: + repeat_dims = [1] image_embeds = [] for single_image_embeds in ip_adapter_image_embeds: if do_classifier_free_guidance: single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) - single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1) - single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) else: - single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) image_embeds.append(single_image_embeds) return image_embeds @@ -589,9 +596,9 @@ def check_inputs( raise ValueError( f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" ) - elif ip_adapter_image_embeds[0].ndim != 3: + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: raise ValueError( - f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D" + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" ) def get_timesteps(self, num_inference_steps, timesteps, strength, device): diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py 
b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py index eab2f7aa22d0..8f31dfc2678a 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py @@ -510,15 +510,22 @@ def prepare_ip_adapter_image_embeds( image_embeds.append(single_image_embeds) else: + repeat_dims = [1] image_embeds = [] for single_image_embeds in ip_adapter_image_embeds: if do_classifier_free_guidance: single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) - single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1) - single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) else: - single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) image_embeds.append(single_image_embeds) return image_embeds @@ -726,9 +733,9 @@ def check_inputs( raise ValueError( f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" ) - elif ip_adapter_image_embeds[0].ndim != 3: + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: raise ValueError( - f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D" + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" ) def check_image(self, image, prompt, prompt_embeds): diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py index 4fc9791d3d8e..9d2c76fd7483 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py @@ -503,15 +503,22 @@ def prepare_ip_adapter_image_embeds( image_embeds.append(single_image_embeds) else: + repeat_dims = [1] image_embeds = [] for single_image_embeds in ip_adapter_image_embeds: if do_classifier_free_guidance: single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) - single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1) - single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) else: - single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) image_embeds.append(single_image_embeds) return image_embeds @@ -713,9 +720,9 @@ def check_inputs( raise ValueError( f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" ) - elif 
ip_adapter_image_embeds[0].ndim != 3: + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: raise ValueError( - f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D" + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" ) # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index ce7537d84215..c4f1bff5efcd 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -628,15 +628,22 @@ def prepare_ip_adapter_image_embeds( image_embeds.append(single_image_embeds) else: + repeat_dims = [1] image_embeds = [] for single_image_embeds in ip_adapter_image_embeds: if do_classifier_free_guidance: single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) - single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1) - single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) else: - single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) image_embeds.append(single_image_embeds) return image_embeds @@ -871,9 +878,9 @@ def check_inputs( raise ValueError( f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" ) - elif ip_adapter_image_embeds[0].ndim != 3: + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: raise ValueError( - f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D" + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" ) # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py index 0f1d5ea48e71..52ffe5a3f356 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py @@ -537,15 +537,22 @@ def prepare_ip_adapter_image_embeds( image_embeds.append(single_image_embeds) else: + repeat_dims = [1] image_embeds = [] for single_image_embeds in ip_adapter_image_embeds: if do_classifier_free_guidance: single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) - single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1) - single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + 
num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) else: - single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) image_embeds.append(single_image_embeds) return image_embeds @@ -817,9 +824,9 @@ def check_inputs( raise ValueError( f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" ) - elif ip_adapter_image_embeds[0].ndim != 3: + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: raise ValueError( - f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D" + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" ) def prepare_control_image( diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py index 9883b4f64790..0b611350a6f1 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py @@ -515,15 +515,22 @@ def prepare_ip_adapter_image_embeds( image_embeds.append(single_image_embeds) else: + repeat_dims = [1] image_embeds = [] for single_image_embeds in ip_adapter_image_embeds: if do_classifier_free_guidance: single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) - single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1) - single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) else: - single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) image_embeds.append(single_image_embeds) return image_embeds @@ -730,9 +737,9 @@ def check_inputs( raise ValueError( f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" ) - elif ip_adapter_image_embeds[0].ndim != 3: + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: raise ValueError( - f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D" + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" ) # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py index cf32ae81c562..4deee37f7df1 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py @@ -567,15 +567,22 @@ def prepare_ip_adapter_image_embeds( image_embeds.append(single_image_embeds) else: + repeat_dims = [1] image_embeds = [] for single_image_embeds in 
ip_adapter_image_embeds: if do_classifier_free_guidance: single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) - single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1) - single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) else: - single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) image_embeds.append(single_image_embeds) return image_embeds @@ -794,9 +801,9 @@ def check_inputs( raise ValueError( f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" ) - elif ip_adapter_image_embeds[0].ndim != 3: + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: raise ValueError( - f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D" + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" ) # Copied from diffusers.pipelines.controlnet.pipeline_controlnet_sd_xl.StableDiffusionXLControlNetPipeline.check_image diff --git a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py index 6d1b1a0db444..f64854ea982b 100644 --- a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +++ b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py @@ -453,15 +453,22 @@ def prepare_ip_adapter_image_embeds( image_embeds.append(single_image_embeds) else: + repeat_dims = [1] image_embeds = [] for single_image_embeds in ip_adapter_image_embeds: if do_classifier_free_guidance: single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) - single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1) - single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) else: - single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) image_embeds.append(single_image_embeds) return image_embeds @@ -647,9 +654,9 @@ def check_inputs( raise ValueError( f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" ) - elif ip_adapter_image_embeds[0].ndim != 3: + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: raise ValueError( - f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D" + f"`ip_adapter_image_embeds` has to 
be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" ) @property diff --git a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py index fa27c0fbd5bc..e9bacaa89ba5 100644 --- a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +++ b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py @@ -437,15 +437,22 @@ def prepare_ip_adapter_image_embeds( image_embeds.append(single_image_embeds) else: + repeat_dims = [1] image_embeds = [] for single_image_embeds in ip_adapter_image_embeds: if do_classifier_free_guidance: single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) - single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1) - single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) else: - single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) image_embeds.append(single_image_embeds) return image_embeds @@ -579,9 +586,9 @@ def check_inputs( raise ValueError( f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" ) - elif ip_adapter_image_embeds[0].ndim != 3: + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: raise ValueError( - f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D" + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" ) @property diff --git a/src/diffusers/pipelines/pia/pipeline_pia.py b/src/diffusers/pipelines/pia/pipeline_pia.py index 485ccb22e5e9..bd3e2891f0d6 100644 --- a/src/diffusers/pipelines/pia/pipeline_pia.py +++ b/src/diffusers/pipelines/pia/pipeline_pia.py @@ -582,9 +582,9 @@ def check_inputs( raise ValueError( f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" ) - elif ip_adapter_image_embeds[0].ndim != 3: + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: raise ValueError( - f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D" + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" ) # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds @@ -619,15 +619,22 @@ def prepare_ip_adapter_image_embeds( image_embeds.append(single_image_embeds) else: + repeat_dims = [1] image_embeds = [] for single_image_embeds in ip_adapter_image_embeds: if do_classifier_free_guidance: single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) - single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1) - single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat( + 
num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) else: - single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) image_embeds.append(single_image_embeds) return image_embeds diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 5126e6f4c378..9e4e6c186ffa 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -520,15 +520,22 @@ def prepare_ip_adapter_image_embeds( image_embeds.append(single_image_embeds) else: + repeat_dims = [1] image_embeds = [] for single_image_embeds in ip_adapter_image_embeds: if do_classifier_free_guidance: single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) - single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1) - single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) else: - single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) image_embeds.append(single_image_embeds) return image_embeds @@ -639,9 +646,9 @@ def check_inputs( raise ValueError( f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" ) - elif ip_adapter_image_embeds[0].ndim != 3: + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: raise ValueError( - f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D" + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" ) def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 9c6fbb2310ac..b43e0eb2abcd 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -564,15 +564,22 @@ def prepare_ip_adapter_image_embeds( image_embeds.append(single_image_embeds) else: + repeat_dims = [1] image_embeds = [] for single_image_embeds in ip_adapter_image_embeds: if do_classifier_free_guidance: single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) - single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1) - single_image_embeds = 
single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) else: - single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) image_embeds.append(single_image_embeds) return image_embeds @@ -685,9 +692,9 @@ def check_inputs( raise ValueError( f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" ) - elif ip_adapter_image_embeds[0].ndim != 3: + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: raise ValueError( - f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D" + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" ) def get_timesteps(self, num_inference_steps, strength, device): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index 3392fd6ddecc..221d5c2cfd3f 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -636,15 +636,22 @@ def prepare_ip_adapter_image_embeds( image_embeds.append(single_image_embeds) else: + repeat_dims = [1] image_embeds = [] for single_image_embeds in ip_adapter_image_embeds: if do_classifier_free_guidance: single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) - single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1) - single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) else: - single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) image_embeds.append(single_image_embeds) return image_embeds @@ -767,9 +774,9 @@ def check_inputs( raise ValueError( f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" ) - elif ip_adapter_image_embeds[0].ndim != 3: + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: raise ValueError( - f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D" + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" ) def prepare_latents( diff --git a/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py b/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py index 502cd340bcd8..dbfb5e08ef23 100644 --- 
a/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +++ b/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py @@ -442,15 +442,22 @@ def prepare_ip_adapter_image_embeds( image_embeds.append(single_image_embeds) else: + repeat_dims = [1] image_embeds = [] for single_image_embeds in ip_adapter_image_embeds: if do_classifier_free_guidance: single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) - single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1) - single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) else: - single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) image_embeds.append(single_image_embeds) return image_embeds @@ -553,9 +560,9 @@ def check_inputs( raise ValueError( f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" ) - elif ip_adapter_image_embeds[0].ndim != 3: + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: raise ValueError( - f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D" + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" ) def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): diff --git a/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py b/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py index 49cc68926b7e..feda710e0049 100644 --- a/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +++ b/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py @@ -414,15 +414,22 @@ def prepare_ip_adapter_image_embeds( image_embeds.append(single_image_embeds) else: + repeat_dims = [1] image_embeds = [] for single_image_embeds in ip_adapter_image_embeds: if do_classifier_free_guidance: single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) - single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1) - single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) else: - single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) image_embeds.append(single_image_embeds) return image_embeds @@ -550,9 +557,9 @@ def check_inputs( raise ValueError( 
f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" ) - elif ip_adapter_image_embeds[0].ndim != 3: + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: raise ValueError( - f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D" + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" ) # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index 4a34ae89d245..776696e9d486 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -549,15 +549,22 @@ def prepare_ip_adapter_image_embeds( image_embeds.append(single_image_embeds) else: + repeat_dims = [1] image_embeds = [] for single_image_embeds in ip_adapter_image_embeds: if do_classifier_free_guidance: single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) - single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1) - single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) else: - single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) image_embeds.append(single_image_embeds) return image_embeds @@ -671,9 +678,9 @@ def check_inputs( raise ValueError( f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" ) - elif ip_adapter_image_embeds[0].ndim != 3: + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: raise ValueError( - f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D" + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" ) # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index ef25ca94d16c..5ba12baad065 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -616,9 +616,9 @@ def check_inputs( raise ValueError( f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" ) - elif ip_adapter_image_embeds[0].ndim != 3: + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: raise ValueError( - f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D" + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" ) def 
get_timesteps(self, num_inference_steps, strength, device, denoising_start=None): @@ -782,15 +782,22 @@ def prepare_ip_adapter_image_embeds( image_embeds.append(single_image_embeds) else: + repeat_dims = [1] image_embeds = [] for single_image_embeds in ip_adapter_image_embeds: if do_classifier_free_guidance: single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) - single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1) - single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) else: - single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) image_embeds.append(single_image_embeds) return image_embeds diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py index 2cb946eb56ad..5b9628f51a41 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py @@ -486,15 +486,22 @@ def prepare_ip_adapter_image_embeds( image_embeds.append(single_image_embeds) else: + repeat_dims = [1] image_embeds = [] for single_image_embeds in ip_adapter_image_embeds: if do_classifier_free_guidance: single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) - single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1) - single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) else: - single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) image_embeds.append(single_image_embeds) return image_embeds @@ -851,9 +858,9 @@ def check_inputs( raise ValueError( f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" ) - elif ip_adapter_image_embeds[0].ndim != 3: + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: raise ValueError( - f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D" + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" ) def prepare_latents( diff --git a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py index abf743f4f305..4e0cc61f5c1d 100644 --- 
a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py @@ -563,15 +563,22 @@ def prepare_ip_adapter_image_embeds( image_embeds.append(single_image_embeds) else: + repeat_dims = [1] image_embeds = [] for single_image_embeds in ip_adapter_image_embeds: if do_classifier_free_guidance: single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) - single_negative_image_embeds = single_negative_image_embeds.repeat(num_images_per_prompt, 1, 1) - single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) else: - single_image_embeds = single_image_embeds.repeat(num_images_per_prompt, 1, 1) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) image_embeds.append(single_image_embeds) return image_embeds @@ -686,9 +693,9 @@ def check_inputs( raise ValueError( f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" ) - elif ip_adapter_image_embeds[0].ndim != 3: + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: raise ValueError( - f"`ip_adapter_image_embeds` has to be a list of 3D tensors but is {ip_adapter_image_embeds[0].ndim}D" + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" ) # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents From 03cd62520ffff950e5ca62cf2e8d7d3a0cd2f5bb Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Sun, 3 Mar 2024 15:10:55 +0530 Subject: [PATCH 23/28] feat: add ip adapter benchmark (#6936) * feat: add ip adapter benchmark * sdxl support too. 
* Empty-Commit --- benchmarks/base_classes.py | 29 ++++++++++++++++++++++++++ benchmarks/benchmark_ip_adapters.py | 32 +++++++++++++++++++++++++++++ benchmarks/run_all.py | 2 +- 3 files changed, 62 insertions(+), 1 deletion(-) create mode 100644 benchmarks/benchmark_ip_adapters.py diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 8f99b26f3a07..dc1ca72388af 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -236,6 +236,35 @@ def run_inference(self, pipe, args): ) +class IPAdapterTextToImageBenchmark(TextToImageBenchmark): + url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_neg_embed.png" + image = load_image(url) + + def __init__(self, args): + pipe = self.pipeline_class.from_pretrained(args.ckpt, torch_dtype=torch.float16).to("cuda") + pipe.load_ip_adapter( + args.ip_adapter_id[0], + subfolder="models" if "sdxl" not in args.ip_adapter_id[1] else "sdxl_models", + weight_name=args.ip_adapter_id[1], + ) + + if args.run_compile: + pipe.unet.to(memory_format=torch.channels_last) + print("Run torch compile") + pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) + + pipe.set_progress_bar_config(disable=True) + self.pipe = pipe + + def run_inference(self, pipe, args): + _ = pipe( + prompt=PROMPT, + ip_adapter_image=self.image, + num_inference_steps=args.num_inference_steps, + num_images_per_prompt=args.batch_size, + ) + + class ControlNetBenchmark(TextToImageBenchmark): pipeline_class = StableDiffusionControlNetPipeline aux_network_class = ControlNetModel diff --git a/benchmarks/benchmark_ip_adapters.py b/benchmarks/benchmark_ip_adapters.py new file mode 100644 index 000000000000..5c11ab38383c --- /dev/null +++ b/benchmarks/benchmark_ip_adapters.py @@ -0,0 +1,32 @@ +import argparse +import sys + + +sys.path.append(".") +from base_classes import IPAdapterTextToImageBenchmark # noqa: E402 + + +IP_ADAPTER_CKPTS = { + "runwayml/stable-diffusion-v1-5": ("h94/IP-Adapter", "ip-adapter_sd15.bin"), + "stabilityai/stable-diffusion-xl-base-1.0": ("h94/IP-Adapter", "ip-adapter_sdxl.bin"), +} + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--ckpt", + type=str, + default="runwayml/stable-diffusion-v1-5", + choices=list(IP_ADAPTER_CKPTS.keys()), + ) + parser.add_argument("--batch_size", type=int, default=1) + parser.add_argument("--num_inference_steps", type=int, default=50) + parser.add_argument("--model_cpu_offload", action="store_true") + parser.add_argument("--run_compile", action="store_true") + args = parser.parse_args() + + args.ip_adapter_id = IP_ADAPTER_CKPTS[args.ckpt] + benchmark_pipe = IPAdapterTextToImageBenchmark(args) + args.ckpt = f"{args.ckpt} (IP-Adapter)" + benchmark_pipe.benchmark(args) diff --git a/benchmarks/run_all.py b/benchmarks/run_all.py index c70fb2227383..8750e1333d9d 100644 --- a/benchmarks/run_all.py +++ b/benchmarks/run_all.py @@ -72,7 +72,7 @@ def main(): command += " --run_compile" run_command(command.split()) - elif file == "benchmark_sd_inpainting.py": + elif file in ["benchmark_sd_inpainting.py", "benchmark_ip_adapters.py"]: sdxl_ckpt = "stabilityai/stable-diffusion-xl-base-1.0" command = f"python {file} --ckpt {sdxl_ckpt}" run_command(command.split()) From b9e1c30d0e314b563c1b7ed59c520ebf743dec9f Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Mon, 4 Mar 2024 08:55:30 +0530 Subject: [PATCH 24/28] [Docs] more elaborate example for peft `torch.compile` (#7161) more elaborate example for peft torch.compile 
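Note on the `repeat_dims` pattern applied throughout the IP-Adapter hunks above: it replaces the hard-coded `.repeat(num_images_per_prompt, 1, 1)` (which assumed 3D embeddings) with a repeat spec built from the tensor's trailing dimensions, so 3D and 4D `ip_adapter_image_embeds` are handled uniformly. A minimal sketch of the idea; the `repeat_embeds` helper and the tensor shapes below are illustrative, not taken from the pipelines:

```python
import torch

def repeat_embeds(embeds: torch.Tensor, num_images_per_prompt: int) -> torch.Tensor:
    # Tile along the batch dimension only; every trailing dimension gets a
    # repeat count of 1, so the same call works for tensors of any rank.
    repeat_dims = [1]
    return embeds.repeat(num_images_per_prompt, *(repeat_dims * len(embeds.shape[1:])))

# Illustrative shapes: 3D (batch, sequence, dim) and 4D (batch, frames, sequence, dim).
three_d = torch.randn(2, 4, 768)
four_d = torch.randn(2, 16, 4, 768)
assert repeat_embeds(three_d, 3).shape == (6, 4, 768)      # same as .repeat(3, 1, 1)
assert repeat_embeds(four_d, 3).shape == (6, 16, 4, 768)   # same as .repeat(3, 1, 1, 1)
```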
--- docs/source/en/tutorials/using_peft_for_inference.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/source/en/tutorials/using_peft_for_inference.md b/docs/source/en/tutorials/using_peft_for_inference.md index fb209a7a8f1f..22708b8750cf 100644 --- a/docs/source/en/tutorials/using_peft_for_inference.md +++ b/docs/source/en/tutorials/using_peft_for_inference.md @@ -169,7 +169,7 @@ list_adapters_component_wise If you want to compile your model with `torch.compile` make sure to first fuse the LoRA weights into the base model and unload them. -```py +```diff pipe.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel") pipe.load_lora_weights("CiroN2022/toy-face", weight_name="toy_face_sdxl.safetensors", adapter_name="toy") @@ -178,12 +178,16 @@ pipe.set_adapters(["pixel", "toy"], adapter_weights=[0.5, 1.0]) pipe.fuse_lora() pipe.unload_lora_weights() -pipe = torch.compile(pipe) ++ pipe.unet.to(memory_format=torch.channels_last) ++ pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) prompt = "toy_face of a hacker with a hoodie, pixel art" image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0] ``` +> [!TIP] +> You can refer to the `torch.compile()` section [here](https://huggingface.co/docs/diffusers/main/en/optimization/torch2.0#torchcompile) and [here](https://huggingface.co/docs/diffusers/main/en/tutorials/fast_diffusion#torchcompile) for more elaborate examples. + ## Fusing adapters into the model You can use PEFT to easily fuse/unfuse multiple adapters directly into the model weights (both UNet and text encoder) using the [`~diffusers.loaders.LoraLoaderMixin.fuse_lora`] method, which can lead to a speed-up in inference and lower VRAM usage. From 2ca264244b3c234c6b12533fc1421182c22a3a53 Mon Sep 17 00:00:00 2001 From: "Vinh H. Pham" Date: Mon, 4 Mar 2024 11:44:14 +0700 Subject: [PATCH 25/28] adding `callback_on_step_end` for `StableDiffusionLDM3DPipeline` (#7149) * refactor callback * run make style * add copy --------- Co-authored-by: Sayak Paul --- .../pipeline_stable_diffusion_ldm3d.py | 210 ++++++++++++++++-- 1 file changed, 191 insertions(+), 19 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py b/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py index dbfb5e08ef23..c7c05feaf013 100644 --- a/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +++ b/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py @@ -59,6 +59,66 @@ """ +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg +def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): + """ + Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). 
See Section 3.4 + """ + std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True) + std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True) + # rescale the results from guidance (fixes overexposure) + noise_pred_rescaled = noise_cfg * (std_text / std_cfg) + # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images + noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg + return noise_cfg + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, + `timesteps` must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default + timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` + must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." 
+ ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + @dataclass class LDM3DPipelineOutput(BaseOutput): """ @@ -125,6 +185,7 @@ class StableDiffusionLDM3DPipeline( model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] _exclude_from_cpu_offload = ["safety_checker"] + _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] def __init__( self, @@ -582,6 +643,66 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype latents = latents * self.scheduler.init_noise_sigma return latents + # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding + def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): + """ + See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 + + Args: + timesteps (`torch.Tensor`): + generate embedding vectors at these timesteps + embedding_dim (`int`, *optional*, defaults to 512): + dimension of the embeddings to generate + dtype: + data type of the generated embeddings + + Returns: + `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + """ + assert len(w.shape) == 1 + w = w * 1000.0 + + half_dim = embedding_dim // 2 + emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb) + emb = w.to(dtype)[:, None] * emb[None, :] + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) + if embedding_dim % 2 == 1: # zero pad + emb = torch.nn.functional.pad(emb, (0, 1)) + assert emb.shape == (w.shape[0], embedding_dim) + return emb + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def guidance_rescale(self): + return self._guidance_rescale + + @property + def clip_skip(self): + return self._clip_skip + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. 
+ @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def num_timesteps(self): + return self._num_timesteps + + @property + def interrupt(self): + return self._interrupt + @torch.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( @@ -590,6 +711,7 @@ def __call__( height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 49, + timesteps: List[int] = None, guidance_scale: float = 5.0, negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: Optional[int] = 1, @@ -602,10 +724,12 @@ def __call__( ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, - callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guidance_rescale: float = 0.0, clip_skip: Optional[int] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, ): r""" The call function to the pipeline for generation. @@ -656,18 +780,21 @@ def __call__( return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a plain tuple. - callback (`Callable`, *optional*): - A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function is called. If not specified, the callback is called at - every step. cross_attention_kwargs (`dict`, *optional*): A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). clip_skip (`int`, *optional*): Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that the output of the pre-final layer will be used for computing the prompt embeddings. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. Examples: Returns: @@ -677,6 +804,22 @@ def __call__( second element is a list of `bool`s indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content. 
""" + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + # 0. Default height and width to unet height = height or self.unet.config.sample_size * self.vae_scale_factor width = width or self.unet.config.sample_size * self.vae_scale_factor @@ -692,8 +835,15 @@ def __call__( negative_prompt_embeds, ip_adapter_image, ip_adapter_image_embeds, + callback_on_step_end_tensor_inputs, ) + self._guidance_scale = guidance_scale + self._guidance_rescale = guidance_rescale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + self._interrupt = False + # 2. Define call parameters if prompt is not None and isinstance(prompt, str): batch_size = 1 @@ -703,10 +853,6 @@ def __call__( batch_size = prompt_embeds.shape[0] device = self._execution_device - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 if ip_adapter_image is not None or ip_adapter_image_embeds is not None: image_embeds = self.prepare_ip_adapter_image_embeds( @@ -714,7 +860,7 @@ def __call__( ip_adapter_image_embeds, device, batch_size * num_images_per_prompt, - do_classifier_free_guidance, + self.do_classifier_free_guidance, ) # 3. Encode input prompt @@ -722,7 +868,7 @@ def __call__( prompt, device, num_images_per_prompt, - do_classifier_free_guidance, + self.do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, @@ -731,12 +877,11 @@ def __call__( # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - if do_classifier_free_guidance: + if self.do_classifier_free_guidance: prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps, device=device) - timesteps = self.scheduler.timesteps + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) # 5. Prepare latent variables num_channels_latents = self.unet.config.in_channels @@ -757,12 +902,24 @@ def __call__( # 6.1 Add image embeds for IP-Adapter added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None + # 6.2 Optionally get Guidance Scale Embedding + timestep_cond = None + if self.unet.config.time_cond_proj_dim is not None: + guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt) + timestep_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim + ).to(device=device, dtype=latents.dtype) + # 7. 
Denoising loop num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + self._num_timesteps = len(timesteps) with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): + if self.interrupt: + continue + # expand the latents if we are doing classifier free guidance - latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual @@ -770,19 +927,34 @@ def __call__( latent_model_input, t, encoder_hidden_states=prompt_embeds, + timestep_cond=timestep_cond, cross_attention_kwargs=cross_attention_kwargs, added_cond_kwargs=added_cond_kwargs, return_dict=False, )[0] # perform guidance - if do_classifier_free_guidance: + if self.do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + if self.do_classifier_free_guidance and self.guidance_rescale > 0.0: + # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf + noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale) + # compute the previous noisy sample x_t -> x_t-1 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + # call the callback, if provided if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() From 56b68459f50f7d3af383a53b02e298a6532f3084 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Mon, 4 Mar 2024 11:29:01 +0530 Subject: [PATCH 26/28] Update requirements.txt to remove huggingface-cli (#7202) Internal message: https://huggingface.slack.com/archives/C03Q18WK18T/p1709529892062479 --- examples/wuerstchen/text_to_image/requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/wuerstchen/text_to_image/requirements.txt b/examples/wuerstchen/text_to_image/requirements.txt index f734c1659d32..dfe8f3db46a2 100644 --- a/examples/wuerstchen/text_to_image/requirements.txt +++ b/examples/wuerstchen/text_to_image/requirements.txt @@ -2,7 +2,6 @@ accelerate>=0.16.0 torchvision transformers>=4.25.1 wandb -huggingface-cli bitsandbytes deepspeed peft>=0.6.0 From 01ee0978cc7e56c1ba7ff0e455ec416bcdeb69dc Mon Sep 17 00:00:00 2001 From: Linoy Tsaban <57615435+linoytsaban@users.noreply.github.com> Date: Mon, 4 Mar 2024 11:07:54 +0100 Subject: [PATCH 27/28] [advanced dreambooth lora sdxl] add DoRA training feature (#7072) * add is_dora arg * style * add dora training feature to sd 1.5 script * added notes about DoRA training --------- Co-authored-by: Sayak Paul --- .../advanced_diffusion_training/README.md | 29 +++++++++++++++++-- .../train_dreambooth_lora_sd15_advanced.py | 12 ++++++++ .../train_dreambooth_lora_sdxl_advanced.py | 12 ++++++++ 3 files changed, 51 insertions(+), 2 deletions(-) diff --git a/examples/advanced_diffusion_training/README.md 
b/examples/advanced_diffusion_training/README.md
index 0a49284543d2..d1c2ff71e639 100644
--- a/examples/advanced_diffusion_training/README.md
+++ b/examples/advanced_diffusion_training/README.md
@@ -80,8 +80,7 @@ To do so, just specify `--train_text_encoder_ti` while launching training (for r
 Please keep the following points in mind:
 * SDXL has two text encoders. So, we fine-tune both using LoRA.
-* When not fine-tuning the text encoders, we ALWAYS precompute the text embeddings to save memoםהקרry.
-
+* When not fine-tuning the text encoders, we ALWAYS precompute the text embeddings to save memory.
 ### 3D icon example
@@ -234,6 +233,32 @@ In ComfyUI we will load a LoRA and a textual embedding at the same time.
 SDXL's VAE is known to suffer from numerical instability issues. This is why we also expose a CLI argument namely `--pretrained_vae_model_name_or_path` that lets you specify the location of a better VAE (such as [this one](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix)).
+### DoRA training
+The advanced script now supports DoRA training too!
+> Proposed in [DoRA: Weight-Decomposed Low-Rank Adaptation](https://arxiv.org/abs/2402.09353),
+**DoRA** is very similar to LoRA, except it decomposes the pre-trained weight into two components, **magnitude** and **direction**, and employs LoRA for _directional_ updates to efficiently minimize the number of trainable parameters.
+The authors found that by using DoRA, both the learning capacity and training stability of LoRA are enhanced without any additional overhead during inference.
+
+> [!NOTE]
+> 💡DoRA training is still _experimental_
+> and is likely to require different hyperparameter values to perform best compared to a LoRA.
+> Specifically, we've noticed 2 differences to take into account in your training:
+> 1. **LoRA seems to converge faster than DoRA** (so a set of parameters that may lead to overfitting when training a LoRA may work well for a DoRA)
+> 2. **DoRA quality is superior to LoRA, especially at lower ranks**: the difference in quality between a DoRA of rank 8 and a LoRA of rank 8 appears to be more significant than when training at ranks of 32 or 64, for example.
+> This is also aligned with some of the quantitative analysis shown in the paper.
+
+**Usage**
+1. To use DoRA you need to install `peft` from main:
+```bash
+pip install git+https://github.com/huggingface/peft.git
+```
+2. Enable DoRA training by adding this flag:
+```bash
+--use_dora
+```
+**Inference**
+Inference is the same as if you had trained a regular LoRA 🤗
+
 ### Tips and Tricks
 Check out [these recommended practices](https://huggingface.co/blog/sdxl_lora_advanced_script#additional-good-practices)
diff --git a/examples/advanced_diffusion_training/train_dreambooth_lora_sd15_advanced.py b/examples/advanced_diffusion_training/train_dreambooth_lora_sd15_advanced.py
index 778e90e2eb25..41284fbd9a33 100644
--- a/examples/advanced_diffusion_training/train_dreambooth_lora_sd15_advanced.py
+++ b/examples/advanced_diffusion_training/train_dreambooth_lora_sd15_advanced.py
@@ -651,6 +651,15 @@ def parse_args(input_args=None):
         default=4,
         help=("The dimension of the LoRA update matrices."),
     )
+    parser.add_argument(
+        "--use_dora",
+        action="store_true",
+        default=False,
+        help=(
+            "Whether to train a DoRA as proposed in DoRA: Weight-Decomposed Low-Rank Adaptation https://arxiv.org/abs/2402.09353. "
+            "Note: to use DoRA you need to install peft from main, `pip install git+https://github.com/huggingface/peft.git`"
+        ),
+    )
     parser.add_argument(
         "--cache_latents",
         action="store_true",
@@ -1219,6 +1228,7 @@ def main(args):
     unet_lora_config = LoraConfig(
         r=args.rank,
         lora_alpha=args.rank,
+        use_dora=args.use_dora,
         init_lora_weights="gaussian",
         target_modules=["to_k", "to_q", "to_v", "to_out.0"],
     )
@@ -1230,6 +1240,7 @@
     text_lora_config = LoraConfig(
         r=args.rank,
         lora_alpha=args.rank,
+        use_dora=args.use_dora,
         init_lora_weights="gaussian",
         target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
     )
diff --git a/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py b/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py
index 94a32bcc07f8..30117dcaeec1 100644
--- a/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py
+++ b/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py
@@ -661,6 +661,15 @@ def parse_args(input_args=None):
         default=4,
         help=("The dimension of the LoRA update matrices."),
     )
+    parser.add_argument(
+        "--use_dora",
+        action="store_true",
+        default=False,
+        help=(
+            "Whether to train a DoRA as proposed in DoRA: Weight-Decomposed Low-Rank Adaptation https://arxiv.org/abs/2402.09353. "
+            "Note: to use DoRA you need to install peft from main, `pip install git+https://github.com/huggingface/peft.git`"
+        ),
+    )
     parser.add_argument(
         "--cache_latents",
         action="store_true",
@@ -1323,6 +1332,7 @@ def main(args):
     unet_lora_config = LoraConfig(
         r=args.rank,
         lora_alpha=args.rank,
+        use_dora=args.use_dora,
         init_lora_weights="gaussian",
         target_modules=["to_k", "to_q", "to_v", "to_out.0"],
     )
@@ -1334,6 +1344,7 @@
     text_lora_config = LoraConfig(
         r=args.rank,
         lora_alpha=args.rank,
+        use_dora=args.use_dora,
         init_lora_weights="gaussian",
         target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
     )
From 869bad3e52c86a33452858901090dd86ff4bebe1 Mon Sep 17 00:00:00 2001
From: Dhruv Nair
Date: Mon, 4 Mar 2024 15:46:39 +0530
Subject: [PATCH 28/28] Fix torch and CUDA versions in ONNX tests (#7164)

update

Co-authored-by: Sayak Paul
---
 docker/diffusers-onnxruntime-cuda/Dockerfile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docker/diffusers-onnxruntime-cuda/Dockerfile b/docker/diffusers-onnxruntime-cuda/Dockerfile
index 6cff9cbb56b5..34e611df7257 100644
--- a/docker/diffusers-onnxruntime-cuda/Dockerfile
+++ b/docker/diffusers-onnxruntime-cuda/Dockerfile
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:11.6.2-cudnn8-devel-ubuntu20.04
+FROM nvidia/cuda:12.1.0-runtime-ubuntu20.04
 LABEL maintainer="Hugging Face"
 LABEL repository="diffusers"
@@ -24,9 +24,9 @@ ENV PATH="/opt/venv/bin:$PATH"
 # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
 RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
     python3 -m uv pip install --no-cache-dir \
-        torch==2.1.2 \
-        torchvision==0.16.2 \
-        torchaudio==2.1.2 \
+        torch \
+        torchvision \
+        torchaudio \
         "onnxruntime-gpu>=1.13.1" \
         --extra-index-url https://download.pytorch.org/whl/cu117 && \
     python3 -m uv pip install --no-cache-dir \