From 626284f8d14203bcb67eea095897071170f6e3aa Mon Sep 17 00:00:00 2001 From: Suraj Patil Date: Thu, 7 Sep 2023 19:05:28 +0200 Subject: [PATCH 01/37] [StableDiffusionXLAdapterPipeline] add adapter_conditioning_factor (#4937) add adapter_conditioning_factor --- .../pipeline_stable_diffusion_xl_adapter.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py index 9bb8569e331d..a0cac5657baf 100644 --- a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py @@ -656,6 +656,7 @@ def __call__( crops_coords_top_left: Tuple[int, int] = (0, 0), target_size: Optional[Tuple[int, int]] = None, adapter_conditioning_scale: Union[float, List[float]] = 1.0, + adapter_conditioning_factor: float = 1.0, ): r""" Function invoked when calling the pipeline for generation. @@ -767,6 +768,10 @@ def __call__( The outputs of the adapter are multiplied by `adapter_conditioning_scale` before they are added to the residual in the original unet. If multiple adapters are specified in init, you can set the corresponding scale as a list. + adapter_conditioning_factor (`float`, *optional*, defaults to 1.0): + The fraction of timesteps for which adapter should be applied. If `adapter_conditioning_factor` is + `0.0`, adapter is not applied at all. If `adapter_conditioning_factor` is `1.0`, adapter is applied for + all timesteps. If `adapter_conditioning_factor` is `0.5`, adapter is applied for half of the timesteps. Examples: Returns: @@ -904,6 +909,12 @@ def __call__( # predict the noise residual added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} + + if i < int(num_inference_steps * adapter_conditioning_factor): + down_block_additional_residuals = [state.clone() for state in adapter_state] + else: + down_block_additional_residuals = None + noise_pred = self.unet( latent_model_input, t, @@ -911,7 +922,7 @@ def __call__( cross_attention_kwargs=cross_attention_kwargs, added_cond_kwargs=added_cond_kwargs, return_dict=False, - down_block_additional_residuals=[state.clone() for state in adapter_state], + down_block_additional_residuals=down_block_additional_residuals, )[0] # perform guidance From 0ec7a02b6a609a31b442cdf18962d7238c5be25d Mon Sep 17 00:00:00 2001 From: Suraj Patil Date: Fri, 8 Sep 2023 04:29:42 +0200 Subject: [PATCH 02/37] [StableDiffusionXLAdapterPipeline] allow negative micro conds (#4941) * allow negative micro conds in t2i pipeline * Empty-Commit --------- Co-authored-by: Sayak Paul --- .../pipeline_stable_diffusion_xl_adapter.py | 30 ++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py index a0cac5657baf..9809e1dddfee 100644 --- a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py @@ -655,6 +655,9 @@ def __call__( original_size: Optional[Tuple[int, int]] = None, crops_coords_top_left: Tuple[int, int] = (0, 0), target_size: Optional[Tuple[int, int]] = None, + negative_original_size: Optional[Tuple[int, int]] = None, + negative_crops_coords_top_left: Tuple[int, int] = (0, 0), + negative_target_size: Optional[Tuple[int, 
int]] = None, adapter_conditioning_scale: Union[float, List[float]] = 1.0, adapter_conditioning_factor: float = 1.0, ): r""" Function invoked when calling the pipeline for generation. @@ -764,6 +767,22 @@ def __call__( For most cases, `target_size` should be set to the desired height and width of the generated image. If not specified it will default to `(width, height)`. Part of SDXL's micro-conditioning as explained in section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + To negatively condition the generation process based on a specific image resolution. Part of SDXL's + micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more + information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. + negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): + To negatively condition the generation process based on specific crop coordinates. Part of SDXL's + micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more + information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. + negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + To negatively condition the generation process based on a target image resolution. It should be the same + as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more + information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. adapter_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0): The outputs of the adapter are multiplied by `adapter_conditioning_scale` before they are added to the residual in the original unet. If multiple adapters are specified in init, you can set the @@ -876,11 +895,20 @@ def __call__( add_time_ids = self._get_add_time_ids( original_size, crops_coords_top_left, target_size, dtype=prompt_embeds.dtype ) + if negative_original_size is not None and negative_target_size is not None: + negative_add_time_ids = self._get_add_time_ids( + negative_original_size, + negative_crops_coords_top_left, + negative_target_size, + dtype=prompt_embeds.dtype, + ) + else: + negative_add_time_ids = add_time_ids if do_classifier_free_guidance: prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0) - add_time_ids = torch.cat([add_time_ids, add_time_ids], dim=0) + add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0) prompt_embeds = prompt_embeds.to(device) add_text_embeds = add_text_embeds.to(device) From dfec61f4b3797bb140b570456ff44a00d7f52502 Mon Sep 17 00:00:00 2001 From: Suraj Patil Date: Fri, 8 Sep 2023 06:33:02 +0200 Subject: [PATCH 03/37] [examples] T2IAdapter training script (#4934) * add t2i_example script * remove in channels logic * remove comments * remove use_euler arg * add requirements * only use canny example * use datasets * comments * make log_validation consistent with other scripts * add readme * fix title in readme * update check_min_version * change a few minor things.
* add doc entry * add: test for t2i adapter training * remove use_auth_token * fix: logged info. * remove tests for now. --------- Co-authored-by: Sayak Paul --- docs/source/en/_toctree.yml | 2 + docs/source/en/training/t2i_adapters.md | 143 ++ examples/t2i_adapter/README.md | 1 + examples/t2i_adapter/README_sdxl.md | 131 ++ examples/t2i_adapter/requirements.txt | 8 + .../t2i_adapter/train_t2i_adapter_sdxl.py | 1276 +++++++++++++++++ 6 files changed, 1561 insertions(+) create mode 100644 docs/source/en/training/t2i_adapters.md create mode 100644 examples/t2i_adapter/README.md create mode 100644 examples/t2i_adapter/README_sdxl.md create mode 100644 examples/t2i_adapter/requirements.txt create mode 100644 examples/t2i_adapter/train_t2i_adapter_sdxl.py diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index b85459f71aae..a2a08b52db42 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -102,6 +102,8 @@ title: InstructPix2Pix Training - local: training/custom_diffusion title: Custom Diffusion + - local: training/t2i_adapters + title: T2I-Adapters title: Training - sections: - local: using-diffusers/other-modalities diff --git a/docs/source/en/training/t2i_adapters.md b/docs/source/en/training/t2i_adapters.md new file mode 100644 index 000000000000..08a4dfaf4599 --- /dev/null +++ b/docs/source/en/training/t2i_adapters.md @@ -0,0 +1,143 @@ + + +# T2I-Adapters for Stable Diffusion XL (SDXL) + +The `train_t2i_adapter_sdxl.py` script (shown below) demonstrates how to implement the [T2I-Adapter training procedure](https://hf.co/papers/2302.08453) for [Stable Diffusion XL](https://huggingface.co/papers/2307.01952). + +## Running locally with PyTorch + +### Installing the dependencies + +Before running the scripts, make sure to install the library's training dependencies: + +**Important** + +To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date, as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment: + +```bash +git clone https://github.com/huggingface/diffusers +cd diffusers +pip install -e . +``` + +Then cd into the `examples/t2i_adapter` folder and run +```bash +pip install -r requirements.txt +``` + +And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with: + +```bash +accelerate config +``` + +Or for a default accelerate configuration without answering questions about your environment + +```bash +accelerate config default +``` + +Or if your environment doesn't support an interactive shell (e.g., a notebook) + +```python +from accelerate.utils import write_basic_config +write_basic_config() +``` + +When running `accelerate config`, enabling torch compile mode can give dramatic speedups. + +## Circle filling dataset + +The original dataset is hosted in the [ControlNet repo](https://huggingface.co/lllyasviel/ControlNet/blob/main/training/fill50k.zip). We re-uploaded it to be compatible with `datasets` [here](https://huggingface.co/datasets/fusing/fill50k). Note that `datasets` handles dataloading within the training script. + +## Training + +Our training examples use two test conditioning images.
They can be downloaded by running + +```sh +wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_1.png + +wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_2.png +``` + +Then run `huggingface-cli login` to log into your Hugging Face account. This is needed to be able to push the trained T2IAdapter parameters to Hugging Face Hub. + +```bash +export MODEL_DIR="stabilityai/stable-diffusion-xl-base-1.0" +export OUTPUT_DIR="path to save model" + +accelerate launch train_t2i_adapter_sdxl.py \ + --pretrained_model_name_or_path=$MODEL_DIR \ + --output_dir=$OUTPUT_DIR \ + --dataset_name=fusing/fill50k \ + --mixed_precision="fp16" \ + --resolution=1024 \ + --learning_rate=1e-5 \ + --max_train_steps=15000 \ + --validation_image "./conditioning_image_1.png" "./conditioning_image_2.png" \ + --validation_prompt "red circle with blue background" "cyan circle with brown floral background" \ + --validation_steps=100 \ + --train_batch_size=1 \ + --gradient_accumulation_steps=4 \ + --report_to="wandb" \ + --seed=42 \ + --push_to_hub +``` + +To better track our training experiments, we're using the following flags in the command above: + +* `report_to="wandb"` will ensure the training runs are tracked on Weights and Biases. To use it, be sure to install `wandb` with `pip install wandb`. +* `validation_image`, `validation_prompt`, and `validation_steps` to allow the script to do a few validation inference runs. This allows us to qualitatively check if the training is progressing as expected. + +Our experiments were conducted on a single 40GB A100 GPU. + +### Inference + +Once training is done, we can perform inference like so: + +```python +from diffusers import StableDiffusionXLAdapterPipeline, T2IAdapter, EulerAncestralDiscreteScheduler +from diffusers.utils import load_image +import torch + +base_model_path = "stabilityai/stable-diffusion-xl-base-1.0" +adapter_path = "path to adapter" + +adapter = T2IAdapter.from_pretrained(adapter_path, torch_dtype=torch.float16) +pipe = StableDiffusionXLAdapterPipeline.from_pretrained( + base_model_path, adapter=adapter, torch_dtype=torch.float16 +) + +# speed up diffusion process with faster scheduler and memory optimization +pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config) +# remove the following line if xformers is not installed or when using Torch 2.0. +pipe.enable_xformers_memory_efficient_attention() +# memory optimization. +pipe.enable_model_cpu_offload() + +control_image = load_image("./conditioning_image_1.png") +prompt = "pale golden rod circle with old lace background" + +# generate image +generator = torch.manual_seed(0) +image = pipe( + prompt, num_inference_steps=20, generator=generator, image=control_image +).images[0] +image.save("./output.png") +``` + +## Notes + +### Specifying a better VAE + +SDXL's VAE is known to suffer from numerical instability issues. This is why we also expose a CLI argument, namely `--pretrained_vae_model_name_or_path`, that lets you specify the location of a better VAE (such as [this one](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix)). diff --git a/examples/t2i_adapter/README.md b/examples/t2i_adapter/README.md new file mode 100644 index 000000000000..7d7491950d0e --- /dev/null +++ b/examples/t2i_adapter/README.md @@ -0,0 +1 @@ +We don't support training T2I-Adapters on Stable Diffusion yet.
For training T2I-Adapters on Stable Diffusion XL, refer [here](./README_sdxl.md). \ No newline at end of file diff --git a/examples/t2i_adapter/README_sdxl.md b/examples/t2i_adapter/README_sdxl.md new file mode 100644 index 000000000000..03053c85d8a5 --- /dev/null +++ b/examples/t2i_adapter/README_sdxl.md @@ -0,0 +1,131 @@ +# T2I-Adapter training example for Stable Diffusion XL (SDXL) + +The `train_t2i_adapter_sdxl.py` script shows how to implement the [T2I-Adapter training procedure](https://hf.co/papers/2302.08453) for [Stable Diffusion XL](https://huggingface.co/papers/2307.01952). + +## Running locally with PyTorch + +### Installing the dependencies + +Before running the scripts, make sure to install the library's training dependencies: + +**Important** + +To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date, as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment: + +```bash +git clone https://github.com/huggingface/diffusers +cd diffusers +pip install -e . +``` + +Then cd into the `examples/t2i_adapter` folder and run +```bash +pip install -r requirements.txt +``` + +And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with: + +```bash +accelerate config +``` + +Or for a default accelerate configuration without answering questions about your environment + +```bash +accelerate config default +``` + +Or if your environment doesn't support an interactive shell (e.g., a notebook) + +```python +from accelerate.utils import write_basic_config +write_basic_config() +``` + +When running `accelerate config`, enabling torch compile mode can give dramatic speedups. + +## Circle filling dataset + +The original dataset is hosted in the [ControlNet repo](https://huggingface.co/lllyasviel/ControlNet/blob/main/training/fill50k.zip). We re-uploaded it to be compatible with `datasets` [here](https://huggingface.co/datasets/fusing/fill50k). Note that `datasets` handles dataloading within the training script. + +## Training + +Our training examples use two test conditioning images. They can be downloaded by running + +```sh +wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_1.png + +wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_2.png +``` + +Then run `huggingface-cli login` to log into your Hugging Face account. This is needed to be able to push the trained T2IAdapter parameters to Hugging Face Hub.
+ +```bash +export MODEL_DIR="stabilityai/stable-diffusion-xl-base-1.0" +export OUTPUT_DIR="path to save model" + +accelerate launch train_t2i_adapter_sdxl.py \ + --pretrained_model_name_or_path=$MODEL_DIR \ + --output_dir=$OUTPUT_DIR \ + --dataset_name=fusing/fill50k \ + --mixed_precision="fp16" \ + --resolution=1024 \ + --learning_rate=1e-5 \ + --max_train_steps=15000 \ + --validation_image "./conditioning_image_1.png" "./conditioning_image_2.png" \ + --validation_prompt "red circle with blue background" "cyan circle with brown floral background" \ + --validation_steps=100 \ + --train_batch_size=1 \ + --gradient_accumulation_steps=4 \ + --report_to="wandb" \ + --seed=42 \ + --push_to_hub +``` + +To better track our training experiments, we're using the following flags in the command above: + +* `report_to="wandb"` will ensure the training runs are tracked on Weights and Biases. To use it, be sure to install `wandb` with `pip install wandb`. +* `validation_image`, `validation_prompt`, and `validation_steps` to allow the script to do a few validation inference runs. This allows us to qualitatively check if the training is progressing as expected. + +Our experiments were conducted on a single 40GB A100 GPU. + +### Inference + +Once training is done, we can perform inference like so: + +```python +from diffusers import StableDiffusionXLAdapterPipeline, T2IAdapter, EulerAncestralDiscreteScheduler +from diffusers.utils import load_image +import torch + +base_model_path = "stabilityai/stable-diffusion-xl-base-1.0" +adapter_path = "path to adapter" + +adapter = T2IAdapter.from_pretrained(adapter_path, torch_dtype=torch.float16) +pipe = StableDiffusionXLAdapterPipeline.from_pretrained( + base_model_path, adapter=adapter, torch_dtype=torch.float16 +) + +# speed up diffusion process with faster scheduler and memory optimization +pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config) +# remove the following line if xformers is not installed or when using Torch 2.0. +pipe.enable_xformers_memory_efficient_attention() +# memory optimization. +pipe.enable_model_cpu_offload() + +control_image = load_image("./conditioning_image_1.png") +prompt = "pale golden rod circle with old lace background" + +# generate image +generator = torch.manual_seed(0) +image = pipe( + prompt, num_inference_steps=20, generator=generator, image=control_image +).images[0] +image.save("./output.png") +``` + +## Notes + +### Specifying a better VAE + +SDXL's VAE is known to suffer from numerical instability issues. This is why we also expose a CLI argument, namely `--pretrained_vae_model_name_or_path`, that lets you specify the location of a better VAE (such as [this one](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix)). diff --git a/examples/t2i_adapter/requirements.txt b/examples/t2i_adapter/requirements.txt new file mode 100644 index 000000000000..2955535b1927 --- /dev/null +++ b/examples/t2i_adapter/requirements.txt @@ -0,0 +1,8 @@ +transformers>=4.25.1 +accelerate>=0.16.0 +safetensors +datasets +torchvision +ftfy +tensorboard +wandb \ No newline at end of file diff --git a/examples/t2i_adapter/train_t2i_adapter_sdxl.py b/examples/t2i_adapter/train_t2i_adapter_sdxl.py new file mode 100644 index 000000000000..b09c7706a25b --- /dev/null +++ b/examples/t2i_adapter/train_t2i_adapter_sdxl.py @@ -0,0 +1,1276 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import functools +import gc +import logging +import math +import os +import random +import shutil +from pathlib import Path + +import accelerate +import numpy as np +import torch +import torch.utils.checkpoint +import transformers +from accelerate import Accelerator +from accelerate.logging import get_logger +from accelerate.utils import ProjectConfiguration, set_seed +from datasets import load_dataset +from huggingface_hub import create_repo, upload_folder +from packaging import version +from PIL import Image +from torchvision import transforms +from tqdm.auto import tqdm +from transformers import AutoTokenizer, PretrainedConfig + +import diffusers +from diffusers import ( + AutoencoderKL, + EulerDiscreteScheduler, + StableDiffusionXLAdapterPipeline, + T2IAdapter, + UNet2DConditionModel, +) +from diffusers.optimization import get_scheduler +from diffusers.utils import check_min_version, is_wandb_available +from diffusers.utils.import_utils import is_xformers_available + + +MAX_SEQ_LENGTH = 77 + +if is_wandb_available(): + import wandb + +# Will error if the minimal version of diffusers is not installed. Remove at your own risk. +check_min_version("0.21.0.dev0") + +logger = get_logger(__name__) + + +def image_grid(imgs, rows, cols): + assert len(imgs) == rows * cols + + w, h = imgs[0].size + grid = Image.new("RGB", size=(cols * w, rows * h)) + + for i, img in enumerate(imgs): + grid.paste(img, box=(i % cols * w, i // cols * h)) + return grid + + +def log_validation(vae, unet, adapter, args, accelerator, weight_dtype, step): + logger.info("Running validation...
") + + adapter = accelerator.unwrap_model(adapter) + + pipeline = StableDiffusionXLAdapterPipeline.from_pretrained( + args.pretrained_model_name_or_path, + vae=vae, + unet=unet, + adapter=adapter, + revision=args.revision, + torch_dtype=weight_dtype, + ) + pipeline = pipeline.to(accelerator.device) + pipeline.set_progress_bar_config(disable=True) + + if args.enable_xformers_memory_efficient_attention: + pipeline.enable_xformers_memory_efficient_attention() + + if args.seed is None: + generator = None + else: + generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) + + if len(args.validation_image) == len(args.validation_prompt): + validation_images = args.validation_image + validation_prompts = args.validation_prompt + elif len(args.validation_image) == 1: + validation_images = args.validation_image * len(args.validation_prompt) + validation_prompts = args.validation_prompt + elif len(args.validation_prompt) == 1: + validation_images = args.validation_image + validation_prompts = args.validation_prompt * len(args.validation_image) + else: + raise ValueError( + "number of `args.validation_image` and `args.validation_prompt` should be checked in `parse_args`" + ) + + image_logs = [] + + for validation_prompt, validation_image in zip(validation_prompts, validation_images): + validation_image = Image.open(validation_image).convert("RGB") + validation_image = validation_image.resize((args.resolution, args.resolution)) + + images = [] + + for _ in range(args.num_validation_images): + with torch.autocast("cuda"): + image = pipeline( + prompt=validation_prompt, image=validation_image, num_inference_steps=20, generator=generator + ).images[0] + images.append(image) + + image_logs.append( + {"validation_image": validation_image, "images": images, "validation_prompt": validation_prompt} + ) + + for tracker in accelerator.trackers: + if tracker.name == "tensorboard": + for log in image_logs: + images = log["images"] + validation_prompt = log["validation_prompt"] + validation_image = log["validation_image"] + + formatted_images = [] + + formatted_images.append(np.asarray(validation_image)) + + for image in images: + formatted_images.append(np.asarray(image)) + + formatted_images = np.stack(formatted_images) + + tracker.writer.add_images(validation_prompt, formatted_images, step, dataformats="NHWC") + elif tracker.name == "wandb": + formatted_images = [] + + for log in image_logs: + images = log["images"] + validation_prompt = log["validation_prompt"] + validation_image = log["validation_image"] + + formatted_images.append(wandb.Image(validation_image, caption="adapter conditioning")) + + for image in images: + image = wandb.Image(image, caption=validation_prompt) + formatted_images.append(image) + + tracker.log({"validation": formatted_images}) + else: + logger.warn(f"image logging not implemented for {tracker.name}") + + del pipeline + gc.collect() + torch.cuda.empty_cache() + + return image_logs + + +def import_model_class_from_model_name_or_path( + pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder" +): + text_encoder_config = PretrainedConfig.from_pretrained( + pretrained_model_name_or_path, subfolder=subfolder, revision=revision + ) + model_class = text_encoder_config.architectures[0] + + if model_class == "CLIPTextModel": + from transformers import CLIPTextModel + + return CLIPTextModel + elif model_class == "CLIPTextModelWithProjection": + from transformers import CLIPTextModelWithProjection + + return CLIPTextModelWithProjection + else: + 
raise ValueError(f"{model_class} is not supported.") + + +def save_model_card(repo_id: str, image_logs=None, base_model=str, repo_folder=None): + img_str = "" + if image_logs is not None: + img_str = "You can find some example images below.\n" + for i, log in enumerate(image_logs): + images = log["images"] + validation_prompt = log["validation_prompt"] + validation_image = log["validation_image"] + validation_image.save(os.path.join(repo_folder, "image_control.png")) + img_str += f"prompt: {validation_prompt}\n" + images = [validation_image] + images + image_grid(images, 1, len(images)).save(os.path.join(repo_folder, f"images_{i}.png")) + img_str += f"![images_{i})](./images_{i}.png)\n" + + yaml = f""" +--- +license: creativeml-openrail-m +base_model: {base_model} +tags: +- stable-diffusion-xl +- stable-diffusion-xl-diffusers +- text-to-image +- diffusers +- t2iadapter +inference: true +--- + """ + model_card = f""" +# t2iadapter-{repo_id} + +These are t2iadapter weights trained on {base_model} with new type of conditioning. +{img_str} +""" + with open(os.path.join(repo_folder, "README.md"), "w") as f: + f.write(yaml + model_card) + + +def parse_args(input_args=None): + parser = argparse.ArgumentParser(description="Simple example of a ControlNet training script.") + parser.add_argument( + "--pretrained_model_name_or_path", + type=str, + default=None, + required=True, + help="Path to pretrained model or model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--pretrained_vae_model_name_or_path", + type=str, + default=None, + help="Path to an improved VAE to stabilize training. For more details check out: https://github.com/huggingface/diffusers/pull/4038.", + ) + parser.add_argument( + "--revision", + type=str, + default=None, + required=False, + help=( + "Revision of pretrained model identifier from huggingface.co/models. Trainable model components should be" + " float32 precision." + ), + ) + parser.add_argument( + "--tokenizer_name", + type=str, + default=None, + help="Pretrained tokenizer name or path if not the same as model_name", + ) + parser.add_argument( + "--output_dir", + type=str, + default="t2iadapter-model", + help="The output directory where the model predictions and checkpoints will be written.", + ) + parser.add_argument( + "--cache_dir", + type=str, + default=None, + help="The directory where the downloaded models and datasets will be stored.", + ) + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") + parser.add_argument( + "--resolution", + type=int, + default=1024, + help=( + "The resolution for input images, all the images in the train/validation dataset will be resized to this" + " resolution" + ), + ) + parser.add_argument( + "--detection_resolution", + type=int, + default=None, + help=( + "The resolution for input images, all the images in the train/validation dataset will be resized to this" + " resolution" + ), + ) + parser.add_argument( + "--crops_coords_top_left_h", + type=int, + default=0, + help=("Coordinate for (the height) to be included in the crop coordinate embeddings needed by SDXL UNet."), + ) + parser.add_argument( + "--crops_coords_top_left_w", + type=int, + default=0, + help=("Coordinate for (the height) to be included in the crop coordinate embeddings needed by SDXL UNet."), + ) + parser.add_argument( + "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader." 
+ ) + parser.add_argument("--num_train_epochs", type=int, default=1) + parser.add_argument( + "--max_train_steps", + type=int, + default=None, + help="Total number of training steps to perform. If provided, overrides num_train_epochs.", + ) + parser.add_argument( + "--checkpointing_steps", + type=int, + default=500, + help=( + "Save a checkpoint of the training state every X updates. Checkpoints can be used for resuming training via `--resume_from_checkpoint`. " + "In the case that the checkpoint is better than the final trained model, the checkpoint can also be used for inference." + "Using a checkpoint for inference requires separate loading of the original pipeline and the individual checkpointed model components." + "See https://huggingface.co/docs/diffusers/main/en/training/dreambooth#performing-inference-using-a-saved-checkpoint for step by step" + "instructions." + ), + ) + parser.add_argument( + "--checkpoints_total_limit", + type=int, + default=3, + help=("Max number of checkpoints to store."), + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help=( + "Whether training should be resumed from a previous checkpoint. Use a path saved by" + ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.' + ), + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--gradient_checkpointing", + action="store_true", + help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=5e-6, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument( + "--scale_lr", + action="store_true", + default=False, + help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.", + ) + parser.add_argument( + "--lr_scheduler", + type=str, + default="constant", + help=( + 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' + ' "constant", "constant_with_warmup"]' + ), + ) + parser.add_argument( + "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler." + ) + parser.add_argument( + "--lr_num_cycles", + type=int, + default=1, + help="Number of hard resets of the lr in cosine_with_restarts scheduler.", + ) + parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.") + parser.add_argument( + "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes." 
+ ) + parser.add_argument( + "--dataloader_num_workers", + type=int, + default=1, + help=("Number of subprocesses to use for data loading."), + ) + parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.") + parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.") + parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") + parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") + parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--hub_model_id", + type=str, + default=None, + help="The name of the repository to keep in sync with the local `output_dir`.", + ) + parser.add_argument( + "--logging_dir", + type=str, + default="logs", + help=( + "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" + " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***." + ), + ) + parser.add_argument( + "--allow_tf32", + action="store_true", + help=( + "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see" + " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices" + ), + ) + parser.add_argument( + "--report_to", + type=str, + default="tensorboard", + help=( + 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`' + ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.' + ), + ) + parser.add_argument( + "--mixed_precision", + type=str, + default=None, + choices=["no", "fp16", "bf16"], + help=( + "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >=" + " 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the" + " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config." + ), + ) + parser.add_argument( + "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." + ) + parser.add_argument( + "--set_grads_to_none", + action="store_true", + help=( + "Save more memory by using setting grads to None instead of zero. Be aware, that this changes certain" + " behaviors, so disable this argument if it causes any problems. More info:" + " https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html" + ), + ) + parser.add_argument( + "--dataset_name", + type=str, + default=None, + help=( + "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private," + " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem," + " or to a folder containing files that 🤗 Datasets can understand." + ), + ) + parser.add_argument( + "--dataset_config_name", + type=str, + default=None, + help="The config of the Dataset, leave as None if there's only one config.", + ) + parser.add_argument( + "--train_data_dir", + type=str, + default=None, + help=( + "A folder containing the training data. 
Folder contents must follow the structure described in" + " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file" + " must exist to provide the captions for the images. Ignored if `dataset_name` is specified." + ), + ) + parser.add_argument( + "--image_column", type=str, default="image", help="The column of the dataset containing the target image." + ) + parser.add_argument( + "--conditioning_image_column", + type=str, + default="conditioning_image", + help="The column of the dataset containing the adapter conditioning image.", + ) + parser.add_argument( + "--caption_column", + type=str, + default="text", + help="The column of the dataset containing a caption or a list of captions.", + ) + parser.add_argument( + "--max_train_samples", + type=int, + default=None, + help=( + "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." + ), + ) + parser.add_argument( + "--proportion_empty_prompts", + type=float, + default=0, + help="Proportion of image prompts to be replaced with empty strings. Defaults to 0 (no prompt replacement).", + ) + parser.add_argument( + "--validation_prompt", + type=str, + default=None, + nargs="+", + help=( + "A set of prompts evaluated every `--validation_steps` and logged to `--report_to`." + " Provide either a matching number of `--validation_image`s, a single `--validation_image`" + " to be used with all prompts, or a single prompt that will be used with all `--validation_image`s." + ), + ) + parser.add_argument( + "--validation_image", + type=str, + default=None, + help=( + "A set of paths to the t2iadapter conditioning image be evaluated every `--validation_steps`" + " and logged to `--report_to`. Provide either a matching number of `--validation_prompt`s, a" + " a single `--validation_prompt` to be used with all `--validation_image`s, or a single" + " `--validation_image` that will be used with all `--validation_prompt`s." + ), + ) + parser.add_argument( + "--num_validation_images", + type=int, + default=4, + help="Number of images to be generated for each `--validation_image`, `--validation_prompt` pair", + ) + parser.add_argument( + "--validation_steps", + type=int, + default=100, + help=( + "Run validation every X steps. Validation consists of running the prompt" + " `args.validation_prompt` multiple times: `args.num_validation_images`" + " and logging the images." 
+ ), + ) + parser.add_argument( + "--tracker_project_name", + type=str, + default="sd_xl_train_t2iadapter", + help=( + "The `project_name` argument passed to Accelerator.init_trackers for" + " more information see https://huggingface.co/docs/accelerate/v0.17.0/en/package_reference/accelerator#accelerate.Accelerator" + ), + ) + + if input_args is not None: + args = parser.parse_args(input_args) + else: + args = parser.parse_args() + + if args.dataset_name is None and args.train_data_dir is None: + raise ValueError("Specify either `--dataset_name` or `--train_data_dir`") + + if args.dataset_name is not None and args.train_data_dir is not None: + raise ValueError("Specify only one of `--dataset_name` or `--train_data_dir`") + + if args.proportion_empty_prompts < 0 or args.proportion_empty_prompts > 1: + raise ValueError("`--proportion_empty_prompts` must be in the range [0, 1].") + + if args.validation_prompt is not None and args.validation_image is None: + raise ValueError("`--validation_image` must be set if `--validation_prompt` is set") + + if args.validation_prompt is None and args.validation_image is not None: + raise ValueError("`--validation_prompt` must be set if `--validation_image` is set") + + if ( + args.validation_image is not None + and args.validation_prompt is not None + and len(args.validation_image) != 1 + and len(args.validation_prompt) != 1 + and len(args.validation_image) != len(args.validation_prompt) + ): + raise ValueError( + "Must provide either 1 `--validation_image`, 1 `--validation_prompt`," + " or the same number of `--validation_prompt`s and `--validation_image`s" + ) + + if args.resolution % 8 != 0: + raise ValueError( + "`--resolution` must be divisible by 8 for consistently sized encoded images between the VAE and the t2iadapter encoder." + ) + + return args + + +def get_train_dataset(args, accelerator): + # Get the datasets: you can either provide your own training and evaluation files (see below) + # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub). + + # In distributed training, the load_dataset function guarantees that only one local process can concurrently + # download the dataset. + if args.dataset_name is not None: + # Downloading and loading a dataset from the hub. + dataset = load_dataset( + args.dataset_name, + args.dataset_config_name, + cache_dir=args.cache_dir, + ) + else: + if args.train_data_dir is not None: + dataset = load_dataset( + args.train_data_dir, + cache_dir=args.cache_dir, + ) + # See more about loading custom images at + # https://huggingface.co/docs/datasets/v2.0.0/en/dataset_script + + # Preprocessing the datasets. + # We need to tokenize inputs and targets. + column_names = dataset["train"].column_names + + # 6. Get the column names for input/target. + if args.image_column is None: + image_column = column_names[0] + logger.info(f"image column defaulting to {image_column}") + else: + image_column = args.image_column + if image_column not in column_names: + raise ValueError( + f"`--image_column` value '{args.image_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}" + ) + + if args.caption_column is None: + caption_column = column_names[1] + logger.info(f"caption column defaulting to {caption_column}") + else: + caption_column = args.caption_column + if caption_column not in column_names: + raise ValueError( + f"`--caption_column` value '{args.caption_column}' not found in dataset columns. 
Dataset columns are: {', '.join(column_names)}" + ) + + if args.conditioning_image_column is None: + conditioning_image_column = column_names[2] + logger.info(f"conditioning image column defaulting to {conditioning_image_column}") + else: + conditioning_image_column = args.conditioning_image_column + if conditioning_image_column not in column_names: + raise ValueError( + f"`--conditioning_image_column` value '{args.conditioning_image_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}" + ) + + with accelerator.main_process_first(): + train_dataset = dataset["train"].shuffle(seed=args.seed) + if args.max_train_samples is not None: + train_dataset = train_dataset.select(range(args.max_train_samples)) + return train_dataset + + +# Adapted from pipelines.StableDiffusionXLPipeline.encode_prompt +def encode_prompt(prompt_batch, text_encoders, tokenizers, proportion_empty_prompts, is_train=True): + prompt_embeds_list = [] + + captions = [] + for caption in prompt_batch: + if random.random() < proportion_empty_prompts: + captions.append("") + elif isinstance(caption, str): + captions.append(caption) + elif isinstance(caption, (list, np.ndarray)): + # take a random caption if there are multiple + captions.append(random.choice(caption) if is_train else caption[0]) + + with torch.no_grad(): + for tokenizer, text_encoder in zip(tokenizers, text_encoders): + text_inputs = tokenizer( + captions, + padding="max_length", + max_length=tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + prompt_embeds = text_encoder( + text_input_ids.to(text_encoder.device), + output_hidden_states=True, + ) + + # We are only ALWAYS interested in the pooled output of the final text encoder + pooled_prompt_embeds = prompt_embeds[0] + prompt_embeds = prompt_embeds.hidden_states[-2] + bs_embed, seq_len, _ = prompt_embeds.shape + prompt_embeds = prompt_embeds.view(bs_embed, seq_len, -1) + prompt_embeds_list.append(prompt_embeds) + + prompt_embeds = torch.concat(prompt_embeds_list, dim=-1) + pooled_prompt_embeds = pooled_prompt_embeds.view(bs_embed, -1) + return prompt_embeds, pooled_prompt_embeds + + +def prepare_train_dataset(dataset, accelerator): + image_transforms = transforms.Compose( + [ + transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR), + transforms.CenterCrop(args.resolution), + transforms.ToTensor(), + transforms.Normalize([0.5], [0.5]), + ] + ) + + conditioning_image_transforms = transforms.Compose( + [ + transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR), + transforms.CenterCrop(args.resolution), + transforms.ToTensor(), + ] + ) + + def preprocess_train(examples): + images = [image.convert("RGB") for image in examples[args.image_column]] + images = [image_transforms(image) for image in images] + + conditioning_images = [image.convert("RGB") for image in examples[args.conditioning_image_column]] + conditioning_images = [conditioning_image_transforms(image) for image in conditioning_images] + + examples["pixel_values"] = images + examples["conditioning_pixel_values"] = conditioning_images + + return examples + + with accelerator.main_process_first(): + dataset = dataset.with_transform(preprocess_train) + + return dataset + + +def collate_fn(examples): + pixel_values = torch.stack([example["pixel_values"] for example in examples]) + pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float() + + conditioning_pixel_values = 
torch.stack([example["conditioning_pixel_values"] for example in examples]) + conditioning_pixel_values = conditioning_pixel_values.to(memory_format=torch.contiguous_format).float() + + prompt_ids = torch.stack([torch.tensor(example["prompt_embeds"]) for example in examples]) + + add_text_embeds = torch.stack([torch.tensor(example["text_embeds"]) for example in examples]) + add_time_ids = torch.stack([torch.tensor(example["time_ids"]) for example in examples]) + + return { + "pixel_values": pixel_values, + "conditioning_pixel_values": conditioning_pixel_values, + "prompt_ids": prompt_ids, + "unet_added_conditions": {"text_embeds": add_text_embeds, "time_ids": add_time_ids}, + } + + +def main(args): + logging_dir = Path(args.output_dir, args.logging_dir) + + accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir) + + accelerator = Accelerator( + gradient_accumulation_steps=args.gradient_accumulation_steps, + mixed_precision=args.mixed_precision, + log_with=args.report_to, + project_config=accelerator_project_config, + ) + + # Make one log on every process with the configuration for debugging. + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + logger.info(accelerator.state, main_process_only=False) + if accelerator.is_local_main_process: + transformers.utils.logging.set_verbosity_warning() + diffusers.utils.logging.set_verbosity_info() + else: + transformers.utils.logging.set_verbosity_error() + diffusers.utils.logging.set_verbosity_error() + + # If passed along, set the training seed now. + if args.seed is not None: + set_seed(args.seed) + + # Handle the repository creation + if accelerator.is_main_process: + if args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + + if args.push_to_hub: + repo_id = create_repo( + repo_id=args.hub_model_id or Path(args.output_dir).name, + exist_ok=True, + token=args.hub_token, + private=True, + ).repo_id + + # Load the tokenizers + tokenizer_one = AutoTokenizer.from_pretrained( + args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision, use_fast=False + ) + tokenizer_two = AutoTokenizer.from_pretrained( + args.pretrained_model_name_or_path, subfolder="tokenizer_2", revision=args.revision, use_fast=False + ) + + # import correct text encoder classes + text_encoder_cls_one = import_model_class_from_model_name_or_path( + args.pretrained_model_name_or_path, args.revision + ) + text_encoder_cls_two = import_model_class_from_model_name_or_path( + args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder_2" + ) + + # Load scheduler and models + noise_scheduler = EulerDiscreteScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") + text_encoder_one = text_encoder_cls_one.from_pretrained( + args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision + ) + text_encoder_two = text_encoder_cls_two.from_pretrained( + args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision + ) + vae_path = ( + args.pretrained_model_name_or_path + if args.pretrained_vae_model_name_or_path is None + else args.pretrained_vae_model_name_or_path + ) + vae = AutoencoderKL.from_pretrained( + vae_path, + subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None, + revision=args.revision, + ) + unet = UNet2DConditionModel.from_pretrained( + args.pretrained_model_name_or_path, 
subfolder="unet", revision=args.revision + ) + + logger.info("Initializing t2iadapter weights from unet") + t2iadapter = T2IAdapter( + in_channels=3, + channels=(320, 640, 1280, 1280), + num_res_blocks=2, + downscale_factor=16, + adapter_type="full_adapter_xl", + ) + + # `accelerate` 0.16.0 will have better support for customized saving + if version.parse(accelerate.__version__) >= version.parse("0.16.0"): + # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format + def save_model_hook(models, weights, output_dir): + i = len(weights) - 1 + + while len(weights) > 0: + weights.pop() + model = models[i] + + sub_dir = "t2iadapter" + model.save_pretrained(os.path.join(output_dir, sub_dir)) + + i -= 1 + + def load_model_hook(models, input_dir): + while len(models) > 0: + # pop models so that they are not loaded again + model = models.pop() + + # load diffusers style into model + load_model = T2IAdapter.from_pretrained(os.path.join(input_dir, "t2iadapter")) + + if args.control_type != "style": + model.register_to_config(**load_model.config) + + model.load_state_dict(load_model.state_dict()) + del load_model + + accelerator.register_save_state_pre_hook(save_model_hook) + accelerator.register_load_state_pre_hook(load_model_hook) + + vae.requires_grad_(False) + text_encoder_one.requires_grad_(False) + text_encoder_two.requires_grad_(False) + t2iadapter.train() + unet.train() + + if args.enable_xformers_memory_efficient_attention: + if is_xformers_available(): + import xformers + + xformers_version = version.parse(xformers.__version__) + if xformers_version == version.parse("0.0.16"): + logger.warn( + "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details." + ) + unet.enable_xformers_memory_efficient_attention() + else: + raise ValueError("xformers is not available. Make sure it is installed correctly") + + if args.gradient_checkpointing: + unet.enable_gradient_checkpointing() + + # Check that all trainable models are in full precision + low_precision_error_string = ( + " Please make sure to always have all model weights in full float32 precision when starting training - even if" + " doing mixed precision training, copy of the weights should still be float32." + ) + + if accelerator.unwrap_model(t2iadapter).dtype != torch.float32: + raise ValueError( + f"Controlnet loaded as datatype {accelerator.unwrap_model(t2iadapter).dtype}. {low_precision_error_string}" + ) + + # Enable TF32 for faster training on Ampere GPUs, + # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices + if args.allow_tf32: + torch.backends.cuda.matmul.allow_tf32 = True + + if args.scale_lr: + args.learning_rate = ( + args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes + ) + + # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs + if args.use_8bit_adam: + try: + import bitsandbytes as bnb + except ImportError: + raise ImportError( + "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`." 
+ ) + + optimizer_class = bnb.optim.AdamW8bit + else: + optimizer_class = torch.optim.AdamW + + # Optimizer creation + params_to_optimize = t2iadapter.parameters() + optimizer = optimizer_class( + params_to_optimize, + lr=args.learning_rate, + betas=(args.adam_beta1, args.adam_beta2), + weight_decay=args.adam_weight_decay, + eps=args.adam_epsilon, + ) + + # For mixed precision training we cast the text_encoder and vae weights to half-precision + # as these models are only used for inference, keeping weights in full precision is not required. + weight_dtype = torch.float32 + if accelerator.mixed_precision == "fp16": + weight_dtype = torch.float16 + elif accelerator.mixed_precision == "bf16": + weight_dtype = torch.bfloat16 + + # Move vae, unet and text_encoder to device and cast to weight_dtype + # The VAE is in float32 to avoid NaN losses. + if args.pretrained_vae_model_name_or_path is not None: + vae.to(accelerator.device, dtype=weight_dtype) + else: + vae.to(accelerator.device, dtype=torch.float32) + unet.to(accelerator.device, dtype=weight_dtype) + text_encoder_one.to(accelerator.device, dtype=weight_dtype) + text_encoder_two.to(accelerator.device, dtype=weight_dtype) + + # Here, we compute not just the text embeddings but also the additional embeddings + # needed for the SD XL UNet to operate. + def compute_embeddings(batch, proportion_empty_prompts, text_encoders, tokenizers, is_train=True): + original_size = (args.resolution, args.resolution) + target_size = (args.resolution, args.resolution) + crops_coords_top_left = (args.crops_coords_top_left_h, args.crops_coords_top_left_w) + prompt_batch = batch[args.caption_column] + + prompt_embeds, pooled_prompt_embeds = encode_prompt( + prompt_batch, text_encoders, tokenizers, proportion_empty_prompts, is_train + ) + add_text_embeds = pooled_prompt_embeds + + # Adapted from pipeline.StableDiffusionXLPipeline._get_add_time_ids + add_time_ids = list(original_size + crops_coords_top_left + target_size) + add_time_ids = torch.tensor([add_time_ids]) + + prompt_embeds = prompt_embeds.to(accelerator.device) + add_text_embeds = add_text_embeds.to(accelerator.device) + add_time_ids = add_time_ids.repeat(len(prompt_batch), 1) + add_time_ids = add_time_ids.to(accelerator.device, dtype=prompt_embeds.dtype) + unet_added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} + + return {"prompt_embeds": prompt_embeds, **unet_added_cond_kwargs} + + def get_sigmas(timesteps, n_dim=4, dtype=torch.float32): + sigmas = noise_scheduler.sigmas.to(device=accelerator.device, dtype=dtype) + schedule_timesteps = noise_scheduler.timesteps.to(accelerator.device) + timesteps = timesteps.to(accelerator.device) + + step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps] + + sigma = sigmas[step_indices].flatten() + while len(sigma.shape) < n_dim: + sigma = sigma.unsqueeze(-1) + return sigma + + # Let's first compute all the embeddings so that we can free up the text encoders + # from memory. 
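+ # The `train_dataset.map(...)` call below runs this embedding computation once over the whole training set, and + # `datasets` caches the result on disk keyed by `new_fingerprint`, so the tokenizers and text encoders are only + # needed for this preprocessing pass, not inside the training loop itself.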
+ text_encoders = [text_encoder_one, text_encoder_two] + tokenizers = [tokenizer_one, tokenizer_two] + train_dataset = get_train_dataset(args, accelerator) + compute_embeddings_fn = functools.partial( + compute_embeddings, + proportion_empty_prompts=args.proportion_empty_prompts, + text_encoders=text_encoders, + tokenizers=tokenizers, + ) + with accelerator.main_process_first(): + from datasets.fingerprint import Hasher + + # fingerprint used by the cache for the other processes to load the result + # details: https://github.com/huggingface/diffusers/pull/4038#discussion_r1266078401 + new_fingerprint = Hasher.hash(args) + train_dataset = train_dataset.map(compute_embeddings_fn, batched=True, new_fingerprint=new_fingerprint) + + # Then get the training dataset ready to be passed to the dataloader. + train_dataset = prepare_train_dataset(train_dataset, accelerator) + + train_dataloader = torch.utils.data.DataLoader( + train_dataset, + shuffle=True, + collate_fn=collate_fn, + batch_size=args.train_batch_size, + num_workers=args.dataloader_num_workers, + ) + + # Scheduler and math around the number of training steps. + overrode_max_train_steps = False + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if args.max_train_steps is None: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + overrode_max_train_steps = True + + lr_scheduler = get_scheduler( + args.lr_scheduler, + optimizer=optimizer, + num_warmup_steps=args.lr_warmup_steps, + num_training_steps=args.max_train_steps, + num_cycles=args.lr_num_cycles, + power=args.lr_power, + ) + + # Prepare everything with our `accelerator`. + t2iadapter, optimizer, lr_scheduler = accelerator.prepare(t2iadapter, optimizer, lr_scheduler) + + # We need to recalculate our total training steps as the size of the training dataloader may have changed. + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if overrode_max_train_steps: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + + # We need to initialize the trackers we use, and also store our configuration. + # The trackers initializes automatically on the main process. + if accelerator.is_main_process: + tracker_config = dict(vars(args)) + + # tensorboard cannot handle list types for config + tracker_config.pop("validation_prompt") + tracker_config.pop("validation_image") + + accelerator.init_trackers(args.tracker_project_name, config=tracker_config) + + # Train! + total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + + logger.info("***** Running training *****") + logger.info(f" Num examples = {len(train_dataset)}") + logger.info(f" Num batches each epoch = {len(train_dataloader)}") + logger.info(f" Num Epochs = {args.num_train_epochs}") + logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") + logger.info(f" Total train batch size (w. 
parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    image_logs = None
+    for epoch in range(first_epoch, args.num_train_epochs):
+        for step, batch in enumerate(train_dataloader):
+            with accelerator.accumulate(t2iadapter):
+                if args.pretrained_vae_model_name_or_path is not None:
+                    pixel_values = batch["pixel_values"].to(dtype=weight_dtype)
+                else:
+                    pixel_values = batch["pixel_values"]
+
+                # encode pixel values with batch size of at most 8 to avoid OOM
+                latents = []
+                for i in range(0, pixel_values.shape[0], 8):
+                    latents.append(vae.encode(pixel_values[i : i + 8]).latent_dist.sample())
+                latents = torch.cat(latents, dim=0)
+                latents = latents * vae.config.scaling_factor
+                if args.pretrained_vae_model_name_or_path is None:
+                    latents = latents.to(weight_dtype)
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(latents)
+                bsz = latents.shape[0]
+
+                # Cubic sampling to sample a random timestep for each image.
+                # For more details about why cubic sampling is used, refer to section 3.4 of https://arxiv.org/abs/2302.08453
+                timesteps = torch.rand((bsz,), device=latents.device)
+                timesteps = (1 - timesteps**3) * noise_scheduler.config.num_train_timesteps
+                timesteps = timesteps.long().to(noise_scheduler.timesteps.dtype)
+                timesteps = timesteps.clamp(0, noise_scheduler.config.num_train_timesteps - 1)
+
+                # Add noise to the latents according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+                # Scale the noisy latents for the UNet
+                sigmas = get_sigmas(timesteps, len(noisy_latents.shape), noisy_latents.dtype)
+                inp_noisy_latents = noisy_latents / ((sigmas**2 + 1) ** 0.5)
+
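The cubic map above deserves a second look: drawing `u` uniformly and taking `1 - u**3` concentrates training on high-noise timesteps, where adapter guidance matters most (see section 3.4 of the T2I-Adapter paper referenced in the comment). A small self-contained sketch of the resulting distribution; the constant `1000` is an assumption standing in for `noise_scheduler.config.num_train_timesteps`:

import torch

num_train_timesteps = 1000
u = torch.rand((100_000,))
timesteps = ((1 - u**3) * num_train_timesteps).long().clamp(0, num_train_timesteps - 1)

# P(t >= 500) = P(u**3 <= 0.5) = 0.5 ** (1 / 3) ~= 0.79
print((timesteps >= 500).float().mean())
# Nearly half of all draws land in the top 10% of the schedule:
# P(t >= 900) = P(u**3 <= 0.1) = 0.1 ** (1 / 3) ~= 0.46
print((timesteps >= 900).float().mean())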
+                # Adapter conditioning.
+                t2iadapter_image = batch["conditioning_pixel_values"].to(dtype=weight_dtype)
+                down_block_additional_residuals = t2iadapter(t2iadapter_image)
+                down_block_additional_residuals = [
+                    sample.to(dtype=weight_dtype) for sample in down_block_additional_residuals
+                ]
+
+                # Predict the noise residual
+                model_pred = unet(
+                    inp_noisy_latents,
+                    timesteps,
+                    encoder_hidden_states=batch["prompt_ids"],
+                    added_cond_kwargs=batch["unet_added_conditions"],
+                    down_block_additional_residuals=down_block_additional_residuals,
+                ).sample
+
+                # Denoise the latents
+                denoised_latents = model_pred * (-sigmas) + noisy_latents
+                weighing = sigmas**-2.0
+
+                # Get the target for loss depending on the prediction type
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = latents  # we are computing loss against denoised latents
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+                # MSE loss
+                loss = torch.mean(
+                    (weighing.float() * (denoised_latents.float() - target.float()) ** 2).reshape(target.shape[0], -1),
+                    dim=1,
+                )
+                loss = loss.mean()
+
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    params_to_clip = t2iadapter.parameters()
+                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad(set_to_none=args.set_grads_to_none)
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+                if accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+                    if args.validation_prompt is not None and global_step % args.validation_steps == 0:
+                        image_logs = log_validation(
+                            vae,
+                            unet,
+                            t2iadapter,
+                            args,
+                            accelerator,
+                            weight_dtype,
+                            global_step,
+                        )
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
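The `checkpoints_total_limit` rotation inside the loop above is worth reading in isolation. A distilled sketch; `prune_checkpoints` is a hypothetical helper, not part of the script:

import os
import shutil

def prune_checkpoints(output_dir: str, total_limit: int) -> None:
    # Sort checkpoint dirs by their step number, oldest first.
    checkpoints = [d for d in os.listdir(output_dir) if d.startswith("checkpoint")]
    checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))

    # Leave room for the save that follows: keep at most `total_limit - 1` now.
    if len(checkpoints) >= total_limit:
        num_to_remove = len(checkpoints) - total_limit + 1
        for stale in checkpoints[:num_to_remove]:
            shutil.rmtree(os.path.join(output_dir, stale))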
+    # Create the pipeline using the trained modules and save it.
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        t2iadapter = accelerator.unwrap_model(t2iadapter)
+        t2iadapter.save_pretrained(args.output_dir)
+
+        if args.push_to_hub:
+            save_model_card(
+                repo_id,
+                image_logs=image_logs,
+                base_model=args.pretrained_model_name_or_path,
+                repo_folder=args.output_dir,
+            )
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)

From d0cf681a1ff3dc2c394c40d877e8fcaf7caa8080 Mon Sep 17 00:00:00 2001
From: Sayak Paul
Date: Fri, 8 Sep 2023 19:45:39 +0530
Subject: [PATCH 04/37] [Tests] add: tests for t2i adapter training. (#4947)

add: tests for t2i adapter training.
---
 .../t2i_adapter/train_t2i_adapter_sdxl.py | 27 +++++++++++++------
 examples/test_examples.py                 | 19 +++++++++++++
 2 files changed, 38 insertions(+), 8 deletions(-)

diff --git a/examples/t2i_adapter/train_t2i_adapter_sdxl.py b/examples/t2i_adapter/train_t2i_adapter_sdxl.py
index b09c7706a25b..3d846f42f649 100644
--- a/examples/t2i_adapter/train_t2i_adapter_sdxl.py
+++ b/examples/t2i_adapter/train_t2i_adapter_sdxl.py
@@ -245,6 +245,13 @@ def parse_args(input_args=None):
         default=None,
         help="Path to an improved VAE to stabilize training. For more details check out: https://github.com/huggingface/diffusers/pull/4038.",
     )
+    parser.add_argument(
+        "--adapter_model_name_or_path",
+        type=str,
+        default=None,
+        help="Path to pretrained adapter model or model identifier from huggingface.co/models."
+        " If not specified, adapter weights are initialized w.r.t. the configuration of SDXL.",
+    )
     parser.add_argument(
         "--revision",
         type=str,
@@ -840,14 +847,18 @@ def main(args):
         args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
     )
 
-    logger.info("Initializing t2iadapter weights from unet")
-    t2iadapter = T2IAdapter(
-        in_channels=3,
-        channels=(320, 640, 1280, 1280),
-        num_res_blocks=2,
-        downscale_factor=16,
-        adapter_type="full_adapter_xl",
-    )
+    if args.adapter_model_name_or_path:
+        logger.info("Loading existing adapter weights.")
+        t2iadapter = T2IAdapter.from_pretrained(args.adapter_model_name_or_path)
+    else:
+        logger.info("Initializing t2iadapter weights.")
+        t2iadapter = T2IAdapter(
+            in_channels=3,
+            channels=(320, 640, 1280, 1280),
+            num_res_blocks=2,
+            downscale_factor=16,
+            adapter_type="full_adapter_xl",
+        )
 
     # `accelerate` 0.16.0 will have better support for customized saving
     if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
diff --git a/examples/test_examples.py b/examples/test_examples.py
index 4da3cbad4899..89e866231e89 100644
--- a/examples/test_examples.py
+++ b/examples/test_examples.py
@@ -1528,6 +1528,25 @@ def test_controlnet_sdxl(self):
 
             self.assertTrue(os.path.isfile(os.path.join(tmpdir, "diffusion_pytorch_model.safetensors")))
 
+    def test_t2i_adapter_sdxl(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                examples/t2i_adapter/train_t2i_adapter_sdxl.py
+                --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-xl-pipe
+                --adapter_model_name_or_path=hf-internal-testing/tiny-adapter
+                --dataset_name=hf-internal-testing/fill10
+                --output_dir={tmpdir}
+                --resolution=64
+                --train_batch_size=1
+                --gradient_accumulation_steps=1
+                --max_train_steps=9
+                --checkpointing_steps=2
+                """.split()
+
+            run_command(self._launch_args + test_args)
+
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, 
"diffusion_pytorch_model.safetensors"))) + def test_custom_diffusion_checkpointing_checkpoints_total_limit(self): with tempfile.TemporaryDirectory() as tmpdir: test_args = f""" From d73e6ad050ee4831bc367d45a3af9750e1204dae Mon Sep 17 00:00:00 2001 From: Will Berman Date: Fri, 8 Sep 2023 10:30:06 -0700 Subject: [PATCH 05/37] guard save model hooks to only execute on main process (#4929) --- examples/controlnet/train_controlnet.py | 15 +++--- examples/controlnet/train_controlnet_sdxl.py | 15 +++--- examples/dreambooth/train_dreambooth.py | 11 +++-- examples/dreambooth/train_dreambooth_lora.py | 39 +++++++-------- .../dreambooth/train_dreambooth_lora_sdxl.py | 47 ++++++++++--------- .../train_instruct_pix2pix.py | 13 ++--- .../train_instruct_pix2pix_sdxl.py | 13 ++--- .../controlnet/train_controlnet_webdataset.py | 15 +++--- .../text_to_image/train_text_to_image.py | 13 ++--- .../train_unconditional.py | 13 ++--- examples/text_to_image/train_text_to_image.py | 13 ++--- .../train_text_to_image_lora_sdxl.py | 47 ++++++++++--------- .../text_to_image/train_text_to_image_sdxl.py | 13 ++--- .../train_unconditional.py | 13 ++--- 14 files changed, 147 insertions(+), 133 deletions(-) diff --git a/examples/controlnet/train_controlnet.py b/examples/controlnet/train_controlnet.py index 2b972d1db785..22af44445e22 100644 --- a/examples/controlnet/train_controlnet.py +++ b/examples/controlnet/train_controlnet.py @@ -785,16 +785,17 @@ def main(args): if version.parse(accelerate.__version__) >= version.parse("0.16.0"): # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format def save_model_hook(models, weights, output_dir): - i = len(weights) - 1 + if accelerator.is_main_process: + i = len(weights) - 1 - while len(weights) > 0: - weights.pop() - model = models[i] + while len(weights) > 0: + weights.pop() + model = models[i] - sub_dir = "controlnet" - model.save_pretrained(os.path.join(output_dir, sub_dir)) + sub_dir = "controlnet" + model.save_pretrained(os.path.join(output_dir, sub_dir)) - i -= 1 + i -= 1 def load_model_hook(models, input_dir): while len(models) > 0: diff --git a/examples/controlnet/train_controlnet_sdxl.py b/examples/controlnet/train_controlnet_sdxl.py index 40c77ca56d69..54a9ddd23dc8 100644 --- a/examples/controlnet/train_controlnet_sdxl.py +++ b/examples/controlnet/train_controlnet_sdxl.py @@ -840,16 +840,17 @@ def main(args): if version.parse(accelerate.__version__) >= version.parse("0.16.0"): # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format def save_model_hook(models, weights, output_dir): - i = len(weights) - 1 + if accelerator.is_main_process: + i = len(weights) - 1 - while len(weights) > 0: - weights.pop() - model = models[i] + while len(weights) > 0: + weights.pop() + model = models[i] - sub_dir = "controlnet" - model.save_pretrained(os.path.join(output_dir, sub_dir)) + sub_dir = "controlnet" + model.save_pretrained(os.path.join(output_dir, sub_dir)) - i -= 1 + i -= 1 def load_model_hook(models, input_dir): while len(models) > 0: diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py index fef162a83fce..2e3a5baabf7e 100644 --- a/examples/dreambooth/train_dreambooth.py +++ b/examples/dreambooth/train_dreambooth.py @@ -920,12 +920,13 @@ def main(args): # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format def save_model_hook(models, weights, output_dir): - for model in models: - sub_dir = 
"unet" if isinstance(model, type(accelerator.unwrap_model(unet))) else "text_encoder" - model.save_pretrained(os.path.join(output_dir, sub_dir)) + if accelerator.is_main_process: + for model in models: + sub_dir = "unet" if isinstance(model, type(accelerator.unwrap_model(unet))) else "text_encoder" + model.save_pretrained(os.path.join(output_dir, sub_dir)) - # make sure to pop weight so that corresponding model is not saved again - weights.pop() + # make sure to pop weight so that corresponding model is not saved again + weights.pop() def load_model_hook(models, input_dir): while len(models) > 0: diff --git a/examples/dreambooth/train_dreambooth_lora.py b/examples/dreambooth/train_dreambooth_lora.py index 8b46d2143bd5..394f19ed6510 100644 --- a/examples/dreambooth/train_dreambooth_lora.py +++ b/examples/dreambooth/train_dreambooth_lora.py @@ -894,27 +894,28 @@ def main(args): # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format def save_model_hook(models, weights, output_dir): - # there are only two options here. Either are just the unet attn processor layers - # or there are the unet and text encoder atten layers - unet_lora_layers_to_save = None - text_encoder_lora_layers_to_save = None - - for model in models: - if isinstance(model, type(accelerator.unwrap_model(unet))): - unet_lora_layers_to_save = unet_attn_processors_state_dict(model) - elif isinstance(model, type(accelerator.unwrap_model(text_encoder))): - text_encoder_lora_layers_to_save = text_encoder_lora_state_dict(model) - else: - raise ValueError(f"unexpected save model: {model.__class__}") + if accelerator.is_main_process: + # there are only two options here. Either are just the unet attn processor layers + # or there are the unet and text encoder atten layers + unet_lora_layers_to_save = None + text_encoder_lora_layers_to_save = None + + for model in models: + if isinstance(model, type(accelerator.unwrap_model(unet))): + unet_lora_layers_to_save = unet_attn_processors_state_dict(model) + elif isinstance(model, type(accelerator.unwrap_model(text_encoder))): + text_encoder_lora_layers_to_save = text_encoder_lora_state_dict(model) + else: + raise ValueError(f"unexpected save model: {model.__class__}") - # make sure to pop weight so that corresponding model is not saved again - weights.pop() + # make sure to pop weight so that corresponding model is not saved again + weights.pop() - LoraLoaderMixin.save_lora_weights( - output_dir, - unet_lora_layers=unet_lora_layers_to_save, - text_encoder_lora_layers=text_encoder_lora_layers_to_save, - ) + LoraLoaderMixin.save_lora_weights( + output_dir, + unet_lora_layers=unet_lora_layers_to_save, + text_encoder_lora_layers=text_encoder_lora_layers_to_save, + ) def load_model_hook(models, input_dir): unet_ = None diff --git a/examples/dreambooth/train_dreambooth_lora_sdxl.py b/examples/dreambooth/train_dreambooth_lora_sdxl.py index 247d111c06e2..88ae8e4359c5 100644 --- a/examples/dreambooth/train_dreambooth_lora_sdxl.py +++ b/examples/dreambooth/train_dreambooth_lora_sdxl.py @@ -798,31 +798,32 @@ def main(args): # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format def save_model_hook(models, weights, output_dir): - # there are only two options here. 
Either are just the unet attn processor layers - # or there are the unet and text encoder atten layers - unet_lora_layers_to_save = None - text_encoder_one_lora_layers_to_save = None - text_encoder_two_lora_layers_to_save = None - - for model in models: - if isinstance(model, type(accelerator.unwrap_model(unet))): - unet_lora_layers_to_save = unet_attn_processors_state_dict(model) - elif isinstance(model, type(accelerator.unwrap_model(text_encoder_one))): - text_encoder_one_lora_layers_to_save = text_encoder_lora_state_dict(model) - elif isinstance(model, type(accelerator.unwrap_model(text_encoder_two))): - text_encoder_two_lora_layers_to_save = text_encoder_lora_state_dict(model) - else: - raise ValueError(f"unexpected save model: {model.__class__}") + if accelerator.is_main_process: + # there are only two options here. Either are just the unet attn processor layers + # or there are the unet and text encoder atten layers + unet_lora_layers_to_save = None + text_encoder_one_lora_layers_to_save = None + text_encoder_two_lora_layers_to_save = None + + for model in models: + if isinstance(model, type(accelerator.unwrap_model(unet))): + unet_lora_layers_to_save = unet_attn_processors_state_dict(model) + elif isinstance(model, type(accelerator.unwrap_model(text_encoder_one))): + text_encoder_one_lora_layers_to_save = text_encoder_lora_state_dict(model) + elif isinstance(model, type(accelerator.unwrap_model(text_encoder_two))): + text_encoder_two_lora_layers_to_save = text_encoder_lora_state_dict(model) + else: + raise ValueError(f"unexpected save model: {model.__class__}") - # make sure to pop weight so that corresponding model is not saved again - weights.pop() + # make sure to pop weight so that corresponding model is not saved again + weights.pop() - StableDiffusionXLPipeline.save_lora_weights( - output_dir, - unet_lora_layers=unet_lora_layers_to_save, - text_encoder_lora_layers=text_encoder_one_lora_layers_to_save, - text_encoder_2_lora_layers=text_encoder_two_lora_layers_to_save, - ) + StableDiffusionXLPipeline.save_lora_weights( + output_dir, + unet_lora_layers=unet_lora_layers_to_save, + text_encoder_lora_layers=text_encoder_one_lora_layers_to_save, + text_encoder_2_lora_layers=text_encoder_two_lora_layers_to_save, + ) def load_model_hook(models, input_dir): unet_ = None diff --git a/examples/instruct_pix2pix/train_instruct_pix2pix.py b/examples/instruct_pix2pix/train_instruct_pix2pix.py index fc0a347e37db..afd5bd355e5d 100644 --- a/examples/instruct_pix2pix/train_instruct_pix2pix.py +++ b/examples/instruct_pix2pix/train_instruct_pix2pix.py @@ -485,14 +485,15 @@ def main(): if version.parse(accelerate.__version__) >= version.parse("0.16.0"): # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format def save_model_hook(models, weights, output_dir): - if args.use_ema: - ema_unet.save_pretrained(os.path.join(output_dir, "unet_ema")) + if accelerator.is_main_process: + if args.use_ema: + ema_unet.save_pretrained(os.path.join(output_dir, "unet_ema")) - for i, model in enumerate(models): - model.save_pretrained(os.path.join(output_dir, "unet")) + for i, model in enumerate(models): + model.save_pretrained(os.path.join(output_dir, "unet")) - # make sure to pop weight so that corresponding model is not saved again - weights.pop() + # make sure to pop weight so that corresponding model is not saved again + weights.pop() def load_model_hook(models, input_dir): if args.use_ema: diff --git a/examples/instruct_pix2pix/train_instruct_pix2pix_sdxl.py 
b/examples/instruct_pix2pix/train_instruct_pix2pix_sdxl.py index b7ac105c2419..2c7d839e4346 100644 --- a/examples/instruct_pix2pix/train_instruct_pix2pix_sdxl.py +++ b/examples/instruct_pix2pix/train_instruct_pix2pix_sdxl.py @@ -528,14 +528,15 @@ def main(): if version.parse(accelerate.__version__) >= version.parse("0.16.0"): # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format def save_model_hook(models, weights, output_dir): - if args.use_ema: - ema_unet.save_pretrained(os.path.join(output_dir, "unet_ema")) + if accelerator.is_main_process: + if args.use_ema: + ema_unet.save_pretrained(os.path.join(output_dir, "unet_ema")) - for i, model in enumerate(models): - model.save_pretrained(os.path.join(output_dir, "unet")) + for i, model in enumerate(models): + model.save_pretrained(os.path.join(output_dir, "unet")) - # make sure to pop weight so that corresponding model is not saved again - weights.pop() + # make sure to pop weight so that corresponding model is not saved again + weights.pop() def load_model_hook(models, input_dir): if args.use_ema: diff --git a/examples/research_projects/controlnet/train_controlnet_webdataset.py b/examples/research_projects/controlnet/train_controlnet_webdataset.py index 9d732abfc454..3122a3952b33 100644 --- a/examples/research_projects/controlnet/train_controlnet_webdataset.py +++ b/examples/research_projects/controlnet/train_controlnet_webdataset.py @@ -1010,16 +1010,17 @@ def main(args): if version.parse(accelerate.__version__) >= version.parse("0.16.0"): # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format def save_model_hook(models, weights, output_dir): - i = len(weights) - 1 + if accelerator.is_main_process: + i = len(weights) - 1 - while len(weights) > 0: - weights.pop() - model = models[i] + while len(weights) > 0: + weights.pop() + model = models[i] - sub_dir = "controlnet" - model.save_pretrained(os.path.join(output_dir, sub_dir)) + sub_dir = "controlnet" + model.save_pretrained(os.path.join(output_dir, sub_dir)) - i -= 1 + i -= 1 def load_model_hook(models, input_dir): while len(models) > 0: diff --git a/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py b/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py index 0cd915b423c7..2548c3a286a6 100644 --- a/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py +++ b/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py @@ -552,14 +552,15 @@ def compute_snr(timesteps): if version.parse(accelerate.__version__) >= version.parse("0.16.0"): # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format def save_model_hook(models, weights, output_dir): - if args.use_ema: - ema_unet.save_pretrained(os.path.join(output_dir, "unet_ema")) + if accelerator.is_main_process: + if args.use_ema: + ema_unet.save_pretrained(os.path.join(output_dir, "unet_ema")) - for i, model in enumerate(models): - model.save_pretrained(os.path.join(output_dir, "unet")) + for i, model in enumerate(models): + model.save_pretrained(os.path.join(output_dir, "unet")) - # make sure to pop weight so that corresponding model is not saved again - weights.pop() + # make sure to pop weight so that corresponding model is not saved again + weights.pop() def load_model_hook(models, input_dir): if args.use_ema: diff --git 
a/examples/research_projects/onnxruntime/unconditional_image_generation/train_unconditional.py b/examples/research_projects/onnxruntime/unconditional_image_generation/train_unconditional.py index 12ff40bbd680..ba5ccd238fdc 100644 --- a/examples/research_projects/onnxruntime/unconditional_image_generation/train_unconditional.py +++ b/examples/research_projects/onnxruntime/unconditional_image_generation/train_unconditional.py @@ -313,14 +313,15 @@ def main(args): if version.parse(accelerate.__version__) >= version.parse("0.16.0"): # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format def save_model_hook(models, weights, output_dir): - if args.use_ema: - ema_model.save_pretrained(os.path.join(output_dir, "unet_ema")) + if accelerator.is_main_process: + if args.use_ema: + ema_model.save_pretrained(os.path.join(output_dir, "unet_ema")) - for i, model in enumerate(models): - model.save_pretrained(os.path.join(output_dir, "unet")) + for i, model in enumerate(models): + model.save_pretrained(os.path.join(output_dir, "unet")) - # make sure to pop weight so that corresponding model is not saved again - weights.pop() + # make sure to pop weight so that corresponding model is not saved again + weights.pop() def load_model_hook(models, input_dir): if args.use_ema: diff --git a/examples/text_to_image/train_text_to_image.py b/examples/text_to_image/train_text_to_image.py index 96b50147600a..542ee61de2e6 100644 --- a/examples/text_to_image/train_text_to_image.py +++ b/examples/text_to_image/train_text_to_image.py @@ -629,14 +629,15 @@ def compute_snr(timesteps): if version.parse(accelerate.__version__) >= version.parse("0.16.0"): # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format def save_model_hook(models, weights, output_dir): - if args.use_ema: - ema_unet.save_pretrained(os.path.join(output_dir, "unet_ema")) + if accelerator.is_main_process: + if args.use_ema: + ema_unet.save_pretrained(os.path.join(output_dir, "unet_ema")) - for i, model in enumerate(models): - model.save_pretrained(os.path.join(output_dir, "unet")) + for i, model in enumerate(models): + model.save_pretrained(os.path.join(output_dir, "unet")) - # make sure to pop weight so that corresponding model is not saved again - weights.pop() + # make sure to pop weight so that corresponding model is not saved again + weights.pop() def load_model_hook(models, input_dir): if args.use_ema: diff --git a/examples/text_to_image/train_text_to_image_lora_sdxl.py b/examples/text_to_image/train_text_to_image_lora_sdxl.py index aa69c822ca7f..a01e68340be7 100644 --- a/examples/text_to_image/train_text_to_image_lora_sdxl.py +++ b/examples/text_to_image/train_text_to_image_lora_sdxl.py @@ -669,31 +669,32 @@ def compute_snr(timesteps): # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format def save_model_hook(models, weights, output_dir): - # there are only two options here. 
Either are just the unet attn processor layers - # or there are the unet and text encoder atten layers - unet_lora_layers_to_save = None - text_encoder_one_lora_layers_to_save = None - text_encoder_two_lora_layers_to_save = None - - for model in models: - if isinstance(model, type(accelerator.unwrap_model(unet))): - unet_lora_layers_to_save = unet_attn_processors_state_dict(model) - elif isinstance(model, type(accelerator.unwrap_model(text_encoder_one))): - text_encoder_one_lora_layers_to_save = text_encoder_lora_state_dict(model) - elif isinstance(model, type(accelerator.unwrap_model(text_encoder_two))): - text_encoder_two_lora_layers_to_save = text_encoder_lora_state_dict(model) - else: - raise ValueError(f"unexpected save model: {model.__class__}") + if accelerator.is_main_process: + # there are only two options here. Either are just the unet attn processor layers + # or there are the unet and text encoder atten layers + unet_lora_layers_to_save = None + text_encoder_one_lora_layers_to_save = None + text_encoder_two_lora_layers_to_save = None + + for model in models: + if isinstance(model, type(accelerator.unwrap_model(unet))): + unet_lora_layers_to_save = unet_attn_processors_state_dict(model) + elif isinstance(model, type(accelerator.unwrap_model(text_encoder_one))): + text_encoder_one_lora_layers_to_save = text_encoder_lora_state_dict(model) + elif isinstance(model, type(accelerator.unwrap_model(text_encoder_two))): + text_encoder_two_lora_layers_to_save = text_encoder_lora_state_dict(model) + else: + raise ValueError(f"unexpected save model: {model.__class__}") - # make sure to pop weight so that corresponding model is not saved again - weights.pop() + # make sure to pop weight so that corresponding model is not saved again + weights.pop() - StableDiffusionXLPipeline.save_lora_weights( - output_dir, - unet_lora_layers=unet_lora_layers_to_save, - text_encoder_lora_layers=text_encoder_one_lora_layers_to_save, - text_encoder_2_lora_layers=text_encoder_two_lora_layers_to_save, - ) + StableDiffusionXLPipeline.save_lora_weights( + output_dir, + unet_lora_layers=unet_lora_layers_to_save, + text_encoder_lora_layers=text_encoder_one_lora_layers_to_save, + text_encoder_2_lora_layers=text_encoder_two_lora_layers_to_save, + ) def load_model_hook(models, input_dir): unet_ = None diff --git a/examples/text_to_image/train_text_to_image_sdxl.py b/examples/text_to_image/train_text_to_image_sdxl.py index d698243f7423..c6a1b907a35c 100644 --- a/examples/text_to_image/train_text_to_image_sdxl.py +++ b/examples/text_to_image/train_text_to_image_sdxl.py @@ -651,14 +651,15 @@ def compute_snr(timesteps): if version.parse(accelerate.__version__) >= version.parse("0.16.0"): # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format def save_model_hook(models, weights, output_dir): - if args.use_ema: - ema_unet.save_pretrained(os.path.join(output_dir, "unet_ema")) + if accelerator.is_main_process: + if args.use_ema: + ema_unet.save_pretrained(os.path.join(output_dir, "unet_ema")) - for i, model in enumerate(models): - model.save_pretrained(os.path.join(output_dir, "unet")) + for i, model in enumerate(models): + model.save_pretrained(os.path.join(output_dir, "unet")) - # make sure to pop weight so that corresponding model is not saved again - weights.pop() + # make sure to pop weight so that corresponding model is not saved again + weights.pop() def load_model_hook(models, input_dir): if args.use_ema: diff --git 
a/examples/unconditional_image_generation/train_unconditional.py b/examples/unconditional_image_generation/train_unconditional.py index bfa48269026a..40e71e718631 100644 --- a/examples/unconditional_image_generation/train_unconditional.py +++ b/examples/unconditional_image_generation/train_unconditional.py @@ -309,14 +309,15 @@ def main(args): if version.parse(accelerate.__version__) >= version.parse("0.16.0"): # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format def save_model_hook(models, weights, output_dir): - if args.use_ema: - ema_model.save_pretrained(os.path.join(output_dir, "unet_ema")) + if accelerator.is_main_process: + if args.use_ema: + ema_model.save_pretrained(os.path.join(output_dir, "unet_ema")) - for i, model in enumerate(models): - model.save_pretrained(os.path.join(output_dir, "unet")) + for i, model in enumerate(models): + model.save_pretrained(os.path.join(output_dir, "unet")) - # make sure to pop weight so that corresponding model is not saved again - weights.pop() + # make sure to pop weight so that corresponding model is not saved again + weights.pop() def load_model_hook(models, input_dir): if args.use_ema: From 914c513ee07026299614f30804e0e28bf079a459 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Sat, 9 Sep 2023 06:52:11 +0530 Subject: [PATCH 06/37] [Docs] add t2i adapter entry to overview of training scripts. (#4946) add t2i adapter entry to overview of training scripts. --- docs/source/en/training/overview.md | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/docs/source/en/training/overview.md b/docs/source/en/training/overview.md index c5cea3bb0a96..c6fe339eda73 100644 --- a/docs/source/en/training/overview.md +++ b/docs/source/en/training/overview.md @@ -34,13 +34,16 @@ If you feel like another important example should exist, we are more than happy Training examples show how to pretrain or fine-tune diffusion models for a variety of tasks. Currently we support: - [Unconditional Training](./unconditional_training) -- [Text-to-Image Training](./text2image) +- [Text-to-Image Training](./text2image)* - [Text Inversion](./text_inversion) -- [Dreambooth](./dreambooth) -- [LoRA Support](./lora) -- [ControlNet](./controlnet) -- [InstructPix2Pix](./instructpix2pix) +- [Dreambooth](./dreambooth)* +- [LoRA Support](./lora)* +- [ControlNet](./controlnet)* +- [InstructPix2Pix](./instructpix2pix)* - [Custom Diffusion](./custom_diffusion) +- [T2I-Adapters](./t2i_adapters)* + +*: Supports [Stable Diffusion XL](../api/pipelines/stable_diffusion/stable_diffusion_xl). If possible, please [install xFormers](../optimization/xformers) for memory efficient attention. This could help make your training faster and less memory intensive. 
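For reference, the main-process guard that #4929 above threads through every `save_model_hook` reduces to one pattern: serialize only on the main process, and pop each entry off `weights` so `accelerator.save_state(...)` does not write the model a second time. A condensed sketch; the `register_save_state_pre_hook` call is assumed here to be how the full scripts wire the hook in (it is the customized-saving API gated on `accelerate` >= 0.16.0 in the diffs):

import os

from accelerate import Accelerator

accelerator = Accelerator()

def save_model_hook(models, weights, output_dir):
    # Guarded exactly like the hooks in the diffs: only the main process
    # serializes, and each saved model's entry is popped off `weights` so
    # `accelerator.save_state(...)` does not serialize it a second time.
    if accelerator.is_main_process:
        for model in models:
            model.save_pretrained(os.path.join(output_dir, "unet"))
            weights.pop()

accelerator.register_save_state_pre_hook(save_model_hook)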
@@ -54,6 +57,7 @@ If possible, please [install xFormers](../optimization/xformers) for memory effi | [**ControlNet**](./controlnet) | ✅ | ✅ | - | | [**InstructPix2Pix**](./instructpix2pix) | ✅ | ✅ | - | | [**Custom Diffusion**](./custom_diffusion) | ✅ | ✅ | - | +| [**T2I Adapters**](./t2i_adapters) | ✅ | ✅ | - | ## Community From 2ab170499eaaf7adfa24a80e0e2717c916f598f1 Mon Sep 17 00:00:00 2001 From: Will Berman Date: Fri, 8 Sep 2023 19:54:59 -0700 Subject: [PATCH 07/37] =?UTF-8?q?Temp=20Revert=20"[Core]=20better=20suppor?= =?UTF-8?q?t=20offloading=20when=20side=20loading=20is=20enabled=E2=80=A6?= =?UTF-8?q?=20(#4927)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Revert "[Core] better support offloading when side loading is enabled. (#4855)" This reverts commit e4b8e7928b2c1972b37af67c64ccc67e42578f8c. --- src/diffusers/loaders.py | 43 -------------- .../pipeline_controlnet_inpaint_sd_xl.py | 26 --------- .../controlnet/pipeline_controlnet_sd_xl.py | 26 --------- .../pipeline_stable_diffusion_xl.py | 26 --------- .../pipeline_stable_diffusion_xl_img2img.py | 26 --------- .../pipeline_stable_diffusion_xl_inpaint.py | 26 --------- tests/models/test_lora_layers.py | 56 +------------------ .../stable_diffusion/test_stable_diffusion.py | 50 ----------------- 8 files changed, 1 insertion(+), 278 deletions(-) diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index 1de899cad927..e4d8f1a3a5eb 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -45,7 +45,6 @@ if is_accelerate_available(): from accelerate import init_empty_weights - from accelerate.hooks import AlignDevicesHook, CpuOffload, remove_hook_from_module from accelerate.utils import set_module_tensor_to_device logger = logging.get_logger(__name__) @@ -779,21 +778,6 @@ def load_textual_inversion( f" `{self.load_textual_inversion.__name__}`" ) - # Remove any existing hooks. - is_model_cpu_offload = False - is_sequential_cpu_offload = False - recursive = False - for _, component in self.components.items(): - if isinstance(component, nn.Module): - if hasattr(component, "_hf_hook"): - is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload) - is_sequential_cpu_offload = isinstance(getattr(component, "_hf_hook"), AlignDevicesHook) - logger.info( - "Accelerate hooks detected. Since you have called `load_textual_inversion()`, the previous hooks will be first removed. Then the textual inversion parameters will be loaded and the hooks will be applied again." - ) - recursive = is_sequential_cpu_offload - remove_hook_from_module(component, recurse=recursive) - cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE) force_download = kwargs.pop("force_download", False) resume_download = kwargs.pop("resume_download", False) @@ -947,12 +931,6 @@ def load_textual_inversion( for token_id, embedding in token_ids_and_embeddings: text_encoder.get_input_embeddings().weight.data[token_id] = embedding - # offload back - if is_model_cpu_offload: - self.enable_model_cpu_offload() - elif is_sequential_cpu_offload: - self.enable_sequential_cpu_offload() - class LoraLoaderMixin: r""" @@ -984,21 +962,6 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di kwargs (`dict`, *optional*): See [`~loaders.LoraLoaderMixin.lora_state_dict`]. """ - # Remove any existing hooks. 
- is_model_cpu_offload = False - is_sequential_cpu_offload = False - recurive = False - for _, component in self.components.items(): - if isinstance(component, nn.Module): - if hasattr(component, "_hf_hook"): - is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload) - is_sequential_cpu_offload = isinstance(getattr(component, "_hf_hook"), AlignDevicesHook) - logger.info( - "Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again." - ) - recurive = is_sequential_cpu_offload - remove_hook_from_module(component, recurse=recurive) - state_dict, network_alphas = self.lora_state_dict(pretrained_model_name_or_path_or_dict, **kwargs) self.load_lora_into_unet(state_dict, network_alphas=network_alphas, unet=self.unet) self.load_lora_into_text_encoder( @@ -1008,12 +971,6 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di lora_scale=self.lora_scale, ) - # Offload back. - if is_model_cpu_offload: - self.enable_model_cpu_offload() - elif is_sequential_cpu_offload: - self.enable_sequential_cpu_offload() - @classmethod def lora_state_dict( cls, diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py index c64204501b97..b20d1f0c636e 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py @@ -1549,26 +1549,6 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di # We could have accessed the unet config from `lora_state_dict()` too. We pass # it here explicitly to be able to tell that it's coming from an SDXL # pipeline. - - # Remove any existing hooks. - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate.hooks import AlignDevicesHook, CpuOffload, remove_hook_from_module - else: - raise ImportError("Offloading requires `accelerate v0.17.0` or higher.") - - is_model_cpu_offload = False - is_sequential_cpu_offload = False - recursive = False - for _, component in self.components.items(): - if isinstance(component, torch.nn.Module): - if hasattr(component, "_hf_hook"): - is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload) - is_sequential_cpu_offload = isinstance(getattr(component, "_hf_hook"), AlignDevicesHook) - logger.info( - "Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again." - ) - recursive = is_sequential_cpu_offload - remove_hook_from_module(component, recurse=recursive) state_dict, network_alphas = self.lora_state_dict( pretrained_model_name_or_path_or_dict, unet_config=self.unet.config, @@ -1596,12 +1576,6 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di lora_scale=self.lora_scale, ) - # Offload back. 
- if is_model_cpu_offload: - self.enable_model_cpu_offload() - elif is_sequential_cpu_offload: - self.enable_sequential_cpu_offload() - @classmethod # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.save_lora_weights def save_lora_weights( diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py index ef6b54e81548..6f2b36ba6976 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py @@ -1216,26 +1216,6 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di # We could have accessed the unet config from `lora_state_dict()` too. We pass # it here explicitly to be able to tell that it's coming from an SDXL # pipeline. - - # Remove any existing hooks. - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate.hooks import AlignDevicesHook, CpuOffload, remove_hook_from_module - else: - raise ImportError("Offloading requires `accelerate v0.17.0` or higher.") - - is_model_cpu_offload = False - is_sequential_cpu_offload = False - recursive = False - for _, component in self.components.items(): - if isinstance(component, torch.nn.Module): - if hasattr(component, "_hf_hook"): - is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload) - is_sequential_cpu_offload = isinstance(getattr(component, "_hf_hook"), AlignDevicesHook) - logger.info( - "Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again." - ) - recursive = is_sequential_cpu_offload - remove_hook_from_module(component, recurse=recursive) state_dict, network_alphas = self.lora_state_dict( pretrained_model_name_or_path_or_dict, unet_config=self.unet.config, @@ -1263,12 +1243,6 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di lora_scale=self.lora_scale, ) - # Offload back. - if is_model_cpu_offload: - self.enable_model_cpu_offload() - elif is_sequential_cpu_offload: - self.enable_sequential_cpu_offload() - @classmethod # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.save_lora_weights def save_lora_weights( diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index 7b7755085ed6..459b47de7ea1 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -922,26 +922,6 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di # We could have accessed the unet config from `lora_state_dict()` too. We pass # it here explicitly to be able to tell that it's coming from an SDXL # pipeline. - - # Remove any existing hooks. 
- if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate.hooks import AlignDevicesHook, CpuOffload, remove_hook_from_module - else: - raise ImportError("Offloading requires `accelerate v0.17.0` or higher.") - - is_model_cpu_offload = False - is_sequential_cpu_offload = False - recursive = False - for _, component in self.components.items(): - if isinstance(component, torch.nn.Module): - if hasattr(component, "_hf_hook"): - is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload) - is_sequential_cpu_offload = isinstance(getattr(component, "_hf_hook"), AlignDevicesHook) - logger.info( - "Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again." - ) - recursive = is_sequential_cpu_offload - remove_hook_from_module(component, recurse=recursive) state_dict, network_alphas = self.lora_state_dict( pretrained_model_name_or_path_or_dict, unet_config=self.unet.config, @@ -969,12 +949,6 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di lora_scale=self.lora_scale, ) - # Offload back. - if is_model_cpu_offload: - self.enable_model_cpu_offload() - elif is_sequential_cpu_offload: - self.enable_sequential_cpu_offload() - @classmethod def save_lora_weights( self, diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index 04902234d54e..b9e2b263b893 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -1072,26 +1072,6 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di # We could have accessed the unet config from `lora_state_dict()` too. We pass # it here explicitly to be able to tell that it's coming from an SDXL # pipeline. - - # Remove any existing hooks. - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate.hooks import AlignDevicesHook, CpuOffload, remove_hook_from_module - else: - raise ImportError("Offloading requires `accelerate v0.17.0` or higher.") - - is_model_cpu_offload = False - is_sequential_cpu_offload = False - recursive = False - for _, component in self.components.items(): - if isinstance(component, torch.nn.Module): - if hasattr(component, "_hf_hook"): - is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload) - is_sequential_cpu_offload = isinstance(getattr(component, "_hf_hook"), AlignDevicesHook) - logger.info( - "Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again." - ) - recursive = is_sequential_cpu_offload - remove_hook_from_module(component, recurse=recursive) state_dict, network_alphas = self.lora_state_dict( pretrained_model_name_or_path_or_dict, unet_config=self.unet.config, @@ -1119,12 +1099,6 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di lora_scale=self.lora_scale, ) - # Offload back. 
- if is_model_cpu_offload: - self.enable_model_cpu_offload() - elif is_sequential_cpu_offload: - self.enable_sequential_cpu_offload() - @classmethod # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.save_lora_weights def save_lora_weights( diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py index 1d86dff702ef..0b00e31a0a50 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py @@ -1392,26 +1392,6 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di # We could have accessed the unet config from `lora_state_dict()` too. We pass # it here explicitly to be able to tell that it's coming from an SDXL # pipeline. - - # Remove any existing hooks. - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate.hooks import AlignDevicesHook, CpuOffload, remove_hook_from_module - else: - raise ImportError("Offloading requires `accelerate v0.17.0` or higher.") - - is_model_cpu_offload = False - is_sequential_cpu_offload = False - recursive = False - for _, component in self.components.items(): - if isinstance(component, torch.nn.Module): - if hasattr(component, "_hf_hook"): - is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload) - is_sequential_cpu_offload = isinstance(getattr(component, "_hf_hook"), AlignDevicesHook) - logger.info( - "Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again." - ) - recursive = is_sequential_cpu_offload - remove_hook_from_module(component, recurse=recursive) state_dict, network_alphas = self.lora_state_dict( pretrained_model_name_or_path_or_dict, unet_config=self.unet.config, @@ -1439,12 +1419,6 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di lora_scale=self.lora_scale, ) - # Offload back. 
- if is_model_cpu_offload: - self.enable_model_cpu_offload() - elif is_sequential_cpu_offload: - self.enable_sequential_cpu_offload() - @classmethod # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.save_lora_weights def save_lora_weights( diff --git a/tests/models/test_lora_layers.py b/tests/models/test_lora_layers.py index c49ea7f2d960..6b3498ae5f41 100644 --- a/tests/models/test_lora_layers.py +++ b/tests/models/test_lora_layers.py @@ -1081,42 +1081,6 @@ def test_a1111(self): self.assertTrue(np.allclose(images, expected, atol=1e-3)) - def test_a1111_with_model_cpu_offload(self): - generator = torch.Generator().manual_seed(0) - - pipe = StableDiffusionPipeline.from_pretrained("hf-internal-testing/Counterfeit-V2.5", safety_checker=None) - pipe.enable_model_cpu_offload() - lora_model_id = "hf-internal-testing/civitai-light-shadow-lora" - lora_filename = "light_and_shadow.safetensors" - pipe.load_lora_weights(lora_model_id, weight_name=lora_filename) - - images = pipe( - "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2 - ).images - - images = images[0, -3:, -3:, -1].flatten() - expected = np.array([0.3636, 0.3708, 0.3694, 0.3679, 0.3829, 0.3677, 0.3692, 0.3688, 0.3292]) - - self.assertTrue(np.allclose(images, expected, atol=1e-3)) - - def test_a1111_with_sequential_cpu_offload(self): - generator = torch.Generator().manual_seed(0) - - pipe = StableDiffusionPipeline.from_pretrained("hf-internal-testing/Counterfeit-V2.5", safety_checker=None) - pipe.enable_sequential_cpu_offload() - lora_model_id = "hf-internal-testing/civitai-light-shadow-lora" - lora_filename = "light_and_shadow.safetensors" - pipe.load_lora_weights(lora_model_id, weight_name=lora_filename) - - images = pipe( - "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2 - ).images - - images = images[0, -3:, -3:, -1].flatten() - expected = np.array([0.3636, 0.3708, 0.3694, 0.3679, 0.3829, 0.3677, 0.3692, 0.3688, 0.3292]) - - self.assertTrue(np.allclose(images, expected, atol=1e-3)) - def test_kohya_sd_v15_with_higher_dimensions(self): generator = torch.Generator().manual_seed(0) @@ -1293,10 +1257,10 @@ def test_sdxl_1_0_lora(self): generator = torch.Generator().manual_seed(0) pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0") - pipe.enable_model_cpu_offload() lora_model_id = "hf-internal-testing/sdxl-1.0-lora" lora_filename = "sd_xl_offset_example-lora_1.0.safetensors" pipe.load_lora_weights(lora_model_id, weight_name=lora_filename) + pipe.enable_model_cpu_offload() images = pipe( "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2 @@ -1447,21 +1411,3 @@ def test_sdxl_1_0_fuse_unfuse_all(self): assert state_dicts_almost_equal(text_encoder_1_sd, pipe.text_encoder.state_dict()) assert state_dicts_almost_equal(text_encoder_2_sd, pipe.text_encoder_2.state_dict()) assert state_dicts_almost_equal(unet_sd, pipe.unet.state_dict()) - - def test_sdxl_1_0_lora_with_sequential_cpu_offloading(self): - generator = torch.Generator().manual_seed(0) - - pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0") - pipe.enable_sequential_cpu_offload() - lora_model_id = "hf-internal-testing/sdxl-1.0-lora" - lora_filename = "sd_xl_offset_example-lora_1.0.safetensors" - pipe.load_lora_weights(lora_model_id, weight_name=lora_filename) - - images = pipe( - "masterpiece, best quality, mountain", 
output_type="np", generator=generator, num_inference_steps=2 - ).images - - images = images[0, -3:, -3:, -1].flatten() - expected = np.array([0.4468, 0.4087, 0.4134, 0.366, 0.3202, 0.3505, 0.3786, 0.387, 0.3535]) - - self.assertTrue(np.allclose(images, expected, atol=1e-3)) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py index 31de557a0ac3..7935a63eceaa 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py @@ -1019,56 +1019,6 @@ def test_stable_diffusion_textual_inversion(self): max_diff = np.abs(expected_image - image).max() assert max_diff < 8e-1 - def test_stable_diffusion_textual_inversion_with_model_cpu_offload(self): - pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") - pipe.enable_model_cpu_offload() - pipe.load_textual_inversion("sd-concepts-library/low-poly-hd-logos-icons") - - a111_file = hf_hub_download("hf-internal-testing/text_inv_embedding_a1111_format", "winter_style.pt") - a111_file_neg = hf_hub_download( - "hf-internal-testing/text_inv_embedding_a1111_format", "winter_style_negative.pt" - ) - pipe.load_textual_inversion(a111_file) - pipe.load_textual_inversion(a111_file_neg) - - generator = torch.Generator(device="cpu").manual_seed(1) - - prompt = "An logo of a turtle in strong Style-Winter with " - neg_prompt = "Style-Winter-neg" - - image = pipe(prompt=prompt, negative_prompt=neg_prompt, generator=generator, output_type="np").images[0] - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text_inv/winter_logo_style.npy" - ) - - max_diff = np.abs(expected_image - image).max() - assert max_diff < 8e-1 - - def test_stable_diffusion_textual_inversion_with_sequential_cpu_offload(self): - pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") - pipe.enable_sequential_cpu_offload() - pipe.load_textual_inversion("sd-concepts-library/low-poly-hd-logos-icons") - - a111_file = hf_hub_download("hf-internal-testing/text_inv_embedding_a1111_format", "winter_style.pt") - a111_file_neg = hf_hub_download( - "hf-internal-testing/text_inv_embedding_a1111_format", "winter_style_negative.pt" - ) - pipe.load_textual_inversion(a111_file) - pipe.load_textual_inversion(a111_file_neg) - - generator = torch.Generator(device="cpu").manual_seed(1) - - prompt = "An logo of a turtle in strong Style-Winter with " - neg_prompt = "Style-Winter-neg" - - image = pipe(prompt=prompt, negative_prompt=neg_prompt, generator=generator, output_type="np").images[0] - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text_inv/winter_logo_style.npy" - ) - - max_diff = np.abs(expected_image - image).max() - assert max_diff < 8e-1 - @require_torch_2 def test_stable_diffusion_compile(self): seed = 0 From 4191ddee1189c988f25b10c08af0a45e96e76893 Mon Sep 17 00:00:00 2001 From: Will Berman Date: Sun, 10 Sep 2023 23:49:46 -0700 Subject: [PATCH 08/37] Revert revert and install accelerate main (#4963) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Revert "Temp Revert "[Core] better support offloading when side loading is enabled… (#4927)" This reverts commit 2ab170499eaaf7adfa24a80e0e2717c916f598f1. 
* tests: install accelerate from main --- .github/workflows/pr_tests.yml | 1 + .github/workflows/push_tests.yml | 1 + .github/workflows/push_tests_mps.yml | 2 +- src/diffusers/loaders.py | 43 ++++++++++++++ .../pipeline_controlnet_inpaint_sd_xl.py | 26 +++++++++ .../controlnet/pipeline_controlnet_sd_xl.py | 26 +++++++++ .../pipeline_stable_diffusion_xl.py | 26 +++++++++ .../pipeline_stable_diffusion_xl_img2img.py | 26 +++++++++ .../pipeline_stable_diffusion_xl_inpaint.py | 26 +++++++++ tests/models/test_lora_layers.py | 56 ++++++++++++++++++- .../stable_diffusion/test_stable_diffusion.py | 50 +++++++++++++++++ 11 files changed, 281 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr_tests.yml b/.github/workflows/pr_tests.yml index defd418edcef..42c0c8e42252 100644 --- a/.github/workflows/pr_tests.yml +++ b/.github/workflows/pr_tests.yml @@ -67,6 +67,7 @@ jobs: run: | apt-get update && apt-get install libsndfile1-dev libgl1 -y python -m pip install -e .[quality,test] + python -m pip install git+https://github.com/huggingface/accelerate.git - name: Environment run: | diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml index 5ec8dbdc4026..a13519ec5876 100644 --- a/.github/workflows/push_tests.yml +++ b/.github/workflows/push_tests.yml @@ -63,6 +63,7 @@ jobs: run: | apt-get update && apt-get install libsndfile1-dev libgl1 -y python -m pip install -e .[quality,test] + python -m pip install git+https://github.com/huggingface/accelerate.git - name: Environment run: | diff --git a/.github/workflows/push_tests_mps.yml b/.github/workflows/push_tests_mps.yml index 6b95815f1ea5..c92aa6426d55 100644 --- a/.github/workflows/push_tests_mps.yml +++ b/.github/workflows/push_tests_mps.yml @@ -40,7 +40,7 @@ jobs: ${CONDA_RUN} python -m pip install --upgrade pip ${CONDA_RUN} python -m pip install -e .[quality,test] ${CONDA_RUN} python -m pip install torch torchvision torchaudio - ${CONDA_RUN} python -m pip install accelerate --upgrade + ${CONDA_RUN} python -m pip install git+https://github.com/huggingface/accelerate.git ${CONDA_RUN} python -m pip install transformers --upgrade - name: Environment diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index e4d8f1a3a5eb..1de899cad927 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -45,6 +45,7 @@ if is_accelerate_available(): from accelerate import init_empty_weights + from accelerate.hooks import AlignDevicesHook, CpuOffload, remove_hook_from_module from accelerate.utils import set_module_tensor_to_device logger = logging.get_logger(__name__) @@ -778,6 +779,21 @@ def load_textual_inversion( f" `{self.load_textual_inversion.__name__}`" ) + # Remove any existing hooks. + is_model_cpu_offload = False + is_sequential_cpu_offload = False + recursive = False + for _, component in self.components.items(): + if isinstance(component, nn.Module): + if hasattr(component, "_hf_hook"): + is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload) + is_sequential_cpu_offload = isinstance(getattr(component, "_hf_hook"), AlignDevicesHook) + logger.info( + "Accelerate hooks detected. Since you have called `load_textual_inversion()`, the previous hooks will be first removed. Then the textual inversion parameters will be loaded and the hooks will be applied again." 
+ )
+ recursive = is_sequential_cpu_offload
+ remove_hook_from_module(component, recurse=recursive)
+
 cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)
 force_download = kwargs.pop("force_download", False)
 resume_download = kwargs.pop("resume_download", False)
@@ -931,6 +947,12 @@ def load_textual_inversion(
 for token_id, embedding in token_ids_and_embeddings:
 text_encoder.get_input_embeddings().weight.data[token_id] = embedding
 
+ # offload back
+ if is_model_cpu_offload:
+ self.enable_model_cpu_offload()
+ elif is_sequential_cpu_offload:
+ self.enable_sequential_cpu_offload()
+
 
 class LoraLoaderMixin:
 r"""
@@ -962,6 +984,21 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di kwargs (`dict`, *optional*):
 See [`~loaders.LoraLoaderMixin.lora_state_dict`].
 """
+ # Remove any existing hooks.
+ is_model_cpu_offload = False
+ is_sequential_cpu_offload = False
+ recursive = False
+ for _, component in self.components.items():
+ if isinstance(component, nn.Module):
+ if hasattr(component, "_hf_hook"):
+ is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload)
+ is_sequential_cpu_offload = isinstance(getattr(component, "_hf_hook"), AlignDevicesHook)
+ logger.info(
+ "Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again."
+ )
+ recursive = is_sequential_cpu_offload
+ remove_hook_from_module(component, recurse=recursive)
+
 state_dict, network_alphas = self.lora_state_dict(pretrained_model_name_or_path_or_dict, **kwargs)
 self.load_lora_into_unet(state_dict, network_alphas=network_alphas, unet=self.unet)
 self.load_lora_into_text_encoder(
@@ -971,6 +1008,12 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di lora_scale=self.lora_scale,
 )
 
+ # Offload back.
+ if is_model_cpu_offload:
+ self.enable_model_cpu_offload()
+ elif is_sequential_cpu_offload:
+ self.enable_sequential_cpu_offload()
+
 @classmethod
 def lora_state_dict(
 cls,
diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py
index b20d1f0c636e..c64204501b97 100644
--- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py
+++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py
@@ -1549,6 +1549,26 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di
 # We could have accessed the unet config from `lora_state_dict()` too. We pass
 # it here explicitly to be able to tell that it's coming from an SDXL
 # pipeline.
+
+ # Remove any existing hooks.
+ if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
+ from accelerate.hooks import AlignDevicesHook, CpuOffload, remove_hook_from_module
+ else:
+ raise ImportError("Offloading requires `accelerate v0.17.0` or higher.")
+
+ is_model_cpu_offload = False
+ is_sequential_cpu_offload = False
+ recursive = False
+ for _, component in self.components.items():
+ if isinstance(component, torch.nn.Module):
+ if hasattr(component, "_hf_hook"):
+ is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload)
+ is_sequential_cpu_offload = isinstance(getattr(component, "_hf_hook"), AlignDevicesHook)
+ logger.info(
+ "Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. 
Then the LoRA parameters will be loaded and the hooks will be applied again." + ) + recursive = is_sequential_cpu_offload + remove_hook_from_module(component, recurse=recursive) state_dict, network_alphas = self.lora_state_dict( pretrained_model_name_or_path_or_dict, unet_config=self.unet.config, @@ -1576,6 +1596,12 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di lora_scale=self.lora_scale, ) + # Offload back. + if is_model_cpu_offload: + self.enable_model_cpu_offload() + elif is_sequential_cpu_offload: + self.enable_sequential_cpu_offload() + @classmethod # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.save_lora_weights def save_lora_weights( diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py index 6f2b36ba6976..ef6b54e81548 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py @@ -1216,6 +1216,26 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di # We could have accessed the unet config from `lora_state_dict()` too. We pass # it here explicitly to be able to tell that it's coming from an SDXL # pipeline. + + # Remove any existing hooks. + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate.hooks import AlignDevicesHook, CpuOffload, remove_hook_from_module + else: + raise ImportError("Offloading requires `accelerate v0.17.0` or higher.") + + is_model_cpu_offload = False + is_sequential_cpu_offload = False + recursive = False + for _, component in self.components.items(): + if isinstance(component, torch.nn.Module): + if hasattr(component, "_hf_hook"): + is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload) + is_sequential_cpu_offload = isinstance(getattr(component, "_hf_hook"), AlignDevicesHook) + logger.info( + "Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again." + ) + recursive = is_sequential_cpu_offload + remove_hook_from_module(component, recurse=recursive) state_dict, network_alphas = self.lora_state_dict( pretrained_model_name_or_path_or_dict, unet_config=self.unet.config, @@ -1243,6 +1263,12 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di lora_scale=self.lora_scale, ) + # Offload back. + if is_model_cpu_offload: + self.enable_model_cpu_offload() + elif is_sequential_cpu_offload: + self.enable_sequential_cpu_offload() + @classmethod # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.save_lora_weights def save_lora_weights( diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index 459b47de7ea1..7b7755085ed6 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -922,6 +922,26 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di # We could have accessed the unet config from `lora_state_dict()` too. We pass # it here explicitly to be able to tell that it's coming from an SDXL # pipeline. + + # Remove any existing hooks. 
+ if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate.hooks import AlignDevicesHook, CpuOffload, remove_hook_from_module + else: + raise ImportError("Offloading requires `accelerate v0.17.0` or higher.") + + is_model_cpu_offload = False + is_sequential_cpu_offload = False + recursive = False + for _, component in self.components.items(): + if isinstance(component, torch.nn.Module): + if hasattr(component, "_hf_hook"): + is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload) + is_sequential_cpu_offload = isinstance(getattr(component, "_hf_hook"), AlignDevicesHook) + logger.info( + "Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again." + ) + recursive = is_sequential_cpu_offload + remove_hook_from_module(component, recurse=recursive) state_dict, network_alphas = self.lora_state_dict( pretrained_model_name_or_path_or_dict, unet_config=self.unet.config, @@ -949,6 +969,12 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di lora_scale=self.lora_scale, ) + # Offload back. + if is_model_cpu_offload: + self.enable_model_cpu_offload() + elif is_sequential_cpu_offload: + self.enable_sequential_cpu_offload() + @classmethod def save_lora_weights( self, diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index b9e2b263b893..04902234d54e 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -1072,6 +1072,26 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di # We could have accessed the unet config from `lora_state_dict()` too. We pass # it here explicitly to be able to tell that it's coming from an SDXL # pipeline. + + # Remove any existing hooks. + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate.hooks import AlignDevicesHook, CpuOffload, remove_hook_from_module + else: + raise ImportError("Offloading requires `accelerate v0.17.0` or higher.") + + is_model_cpu_offload = False + is_sequential_cpu_offload = False + recursive = False + for _, component in self.components.items(): + if isinstance(component, torch.nn.Module): + if hasattr(component, "_hf_hook"): + is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload) + is_sequential_cpu_offload = isinstance(getattr(component, "_hf_hook"), AlignDevicesHook) + logger.info( + "Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again." + ) + recursive = is_sequential_cpu_offload + remove_hook_from_module(component, recurse=recursive) state_dict, network_alphas = self.lora_state_dict( pretrained_model_name_or_path_or_dict, unet_config=self.unet.config, @@ -1099,6 +1119,12 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di lora_scale=self.lora_scale, ) + # Offload back. 
+ if is_model_cpu_offload: + self.enable_model_cpu_offload() + elif is_sequential_cpu_offload: + self.enable_sequential_cpu_offload() + @classmethod # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.save_lora_weights def save_lora_weights( diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py index 0b00e31a0a50..1d86dff702ef 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py @@ -1392,6 +1392,26 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di # We could have accessed the unet config from `lora_state_dict()` too. We pass # it here explicitly to be able to tell that it's coming from an SDXL # pipeline. + + # Remove any existing hooks. + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate.hooks import AlignDevicesHook, CpuOffload, remove_hook_from_module + else: + raise ImportError("Offloading requires `accelerate v0.17.0` or higher.") + + is_model_cpu_offload = False + is_sequential_cpu_offload = False + recursive = False + for _, component in self.components.items(): + if isinstance(component, torch.nn.Module): + if hasattr(component, "_hf_hook"): + is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload) + is_sequential_cpu_offload = isinstance(getattr(component, "_hf_hook"), AlignDevicesHook) + logger.info( + "Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again." + ) + recursive = is_sequential_cpu_offload + remove_hook_from_module(component, recurse=recursive) state_dict, network_alphas = self.lora_state_dict( pretrained_model_name_or_path_or_dict, unet_config=self.unet.config, @@ -1419,6 +1439,12 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di lora_scale=self.lora_scale, ) + # Offload back. 
+ if is_model_cpu_offload: + self.enable_model_cpu_offload() + elif is_sequential_cpu_offload: + self.enable_sequential_cpu_offload() + @classmethod # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.save_lora_weights def save_lora_weights( diff --git a/tests/models/test_lora_layers.py b/tests/models/test_lora_layers.py index 6b3498ae5f41..c49ea7f2d960 100644 --- a/tests/models/test_lora_layers.py +++ b/tests/models/test_lora_layers.py @@ -1081,6 +1081,42 @@ def test_a1111(self): self.assertTrue(np.allclose(images, expected, atol=1e-3)) + def test_a1111_with_model_cpu_offload(self): + generator = torch.Generator().manual_seed(0) + + pipe = StableDiffusionPipeline.from_pretrained("hf-internal-testing/Counterfeit-V2.5", safety_checker=None) + pipe.enable_model_cpu_offload() + lora_model_id = "hf-internal-testing/civitai-light-shadow-lora" + lora_filename = "light_and_shadow.safetensors" + pipe.load_lora_weights(lora_model_id, weight_name=lora_filename) + + images = pipe( + "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2 + ).images + + images = images[0, -3:, -3:, -1].flatten() + expected = np.array([0.3636, 0.3708, 0.3694, 0.3679, 0.3829, 0.3677, 0.3692, 0.3688, 0.3292]) + + self.assertTrue(np.allclose(images, expected, atol=1e-3)) + + def test_a1111_with_sequential_cpu_offload(self): + generator = torch.Generator().manual_seed(0) + + pipe = StableDiffusionPipeline.from_pretrained("hf-internal-testing/Counterfeit-V2.5", safety_checker=None) + pipe.enable_sequential_cpu_offload() + lora_model_id = "hf-internal-testing/civitai-light-shadow-lora" + lora_filename = "light_and_shadow.safetensors" + pipe.load_lora_weights(lora_model_id, weight_name=lora_filename) + + images = pipe( + "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2 + ).images + + images = images[0, -3:, -3:, -1].flatten() + expected = np.array([0.3636, 0.3708, 0.3694, 0.3679, 0.3829, 0.3677, 0.3692, 0.3688, 0.3292]) + + self.assertTrue(np.allclose(images, expected, atol=1e-3)) + def test_kohya_sd_v15_with_higher_dimensions(self): generator = torch.Generator().manual_seed(0) @@ -1257,10 +1293,10 @@ def test_sdxl_1_0_lora(self): generator = torch.Generator().manual_seed(0) pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0") + pipe.enable_model_cpu_offload() lora_model_id = "hf-internal-testing/sdxl-1.0-lora" lora_filename = "sd_xl_offset_example-lora_1.0.safetensors" pipe.load_lora_weights(lora_model_id, weight_name=lora_filename) - pipe.enable_model_cpu_offload() images = pipe( "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2 @@ -1411,3 +1447,21 @@ def test_sdxl_1_0_fuse_unfuse_all(self): assert state_dicts_almost_equal(text_encoder_1_sd, pipe.text_encoder.state_dict()) assert state_dicts_almost_equal(text_encoder_2_sd, pipe.text_encoder_2.state_dict()) assert state_dicts_almost_equal(unet_sd, pipe.unet.state_dict()) + + def test_sdxl_1_0_lora_with_sequential_cpu_offloading(self): + generator = torch.Generator().manual_seed(0) + + pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0") + pipe.enable_sequential_cpu_offload() + lora_model_id = "hf-internal-testing/sdxl-1.0-lora" + lora_filename = "sd_xl_offset_example-lora_1.0.safetensors" + pipe.load_lora_weights(lora_model_id, weight_name=lora_filename) + + images = pipe( + "masterpiece, best quality, mountain", 
output_type="np", generator=generator, num_inference_steps=2 + ).images + + images = images[0, -3:, -3:, -1].flatten() + expected = np.array([0.4468, 0.4087, 0.4134, 0.366, 0.3202, 0.3505, 0.3786, 0.387, 0.3535]) + + self.assertTrue(np.allclose(images, expected, atol=1e-3)) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py index 7935a63eceaa..31de557a0ac3 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py @@ -1019,6 +1019,56 @@ def test_stable_diffusion_textual_inversion(self): max_diff = np.abs(expected_image - image).max() assert max_diff < 8e-1 + def test_stable_diffusion_textual_inversion_with_model_cpu_offload(self): + pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") + pipe.enable_model_cpu_offload() + pipe.load_textual_inversion("sd-concepts-library/low-poly-hd-logos-icons") + + a111_file = hf_hub_download("hf-internal-testing/text_inv_embedding_a1111_format", "winter_style.pt") + a111_file_neg = hf_hub_download( + "hf-internal-testing/text_inv_embedding_a1111_format", "winter_style_negative.pt" + ) + pipe.load_textual_inversion(a111_file) + pipe.load_textual_inversion(a111_file_neg) + + generator = torch.Generator(device="cpu").manual_seed(1) + + prompt = "An logo of a turtle in strong Style-Winter with " + neg_prompt = "Style-Winter-neg" + + image = pipe(prompt=prompt, negative_prompt=neg_prompt, generator=generator, output_type="np").images[0] + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text_inv/winter_logo_style.npy" + ) + + max_diff = np.abs(expected_image - image).max() + assert max_diff < 8e-1 + + def test_stable_diffusion_textual_inversion_with_sequential_cpu_offload(self): + pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") + pipe.enable_sequential_cpu_offload() + pipe.load_textual_inversion("sd-concepts-library/low-poly-hd-logos-icons") + + a111_file = hf_hub_download("hf-internal-testing/text_inv_embedding_a1111_format", "winter_style.pt") + a111_file_neg = hf_hub_download( + "hf-internal-testing/text_inv_embedding_a1111_format", "winter_style_negative.pt" + ) + pipe.load_textual_inversion(a111_file) + pipe.load_textual_inversion(a111_file_neg) + + generator = torch.Generator(device="cpu").manual_seed(1) + + prompt = "An logo of a turtle in strong Style-Winter with " + neg_prompt = "Style-Winter-neg" + + image = pipe(prompt=prompt, negative_prompt=neg_prompt, generator=generator, output_type="np").images[0] + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text_inv/winter_logo_style.npy" + ) + + max_diff = np.abs(expected_image - image).max() + assert max_diff < 8e-1 + @require_torch_2 def test_stable_diffusion_compile(self): seed = 0 From 88735249da94266a433368d2b899e87dc33446c9 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Mon, 11 Sep 2023 12:42:53 +0530 Subject: [PATCH 09/37] =?UTF-8?q?[Docs]=20fix:=20minor=20formatting=20in?= =?UTF-8?q?=20the=20W=C3=BCrstchen=20=20docs=20(#4965)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix: minor formatting in the docs --- docs/source/en/api/pipelines/wuerstchen.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/source/en/api/pipelines/wuerstchen.md b/docs/source/en/api/pipelines/wuerstchen.md 
index 4316bc739ca9..9ead9456bfb3 100644 --- a/docs/source/en/api/pipelines/wuerstchen.md +++ b/docs/source/en/api/pipelines/wuerstchen.md @@ -18,6 +18,7 @@ After the initial paper release, we have improved numerous things in the archite - Better quality We are releasing 3 checkpoints for the text-conditional image generation model (Stage C). Those are: + - v2-base - v2-aesthetic - v2-interpolated (50% interpolation between v2-base and v2-aesthetic) @@ -58,7 +59,7 @@ output = pipeline( ).images ``` -For explanation purposes, we can also initialize the two main pipelines of Würstchen individually. Würstchen consists of 3 stages: Stage C, Stage B, Stage A. They all have different jobs and work only together. When generating text-conditional images, Stage C will first generate the latents in a very compressed latent space. This is what happens in the `prior_pipeline`. Afterwards, the generated latents will be passed to Stage B, which decompresses the latents into a bigger latent space of a VQGAN. These latents can then be decoded by Stage A, which is a VQGAN, into the pixel-space. Stage B & Stage A are both encapsulated in the `decoder_pipeline`. For more details, take a look the [paper](https://huggingface.co/papers/2306.00637). +For explanation purposes, we can also initialize the two main pipelines of Würstchen individually. Würstchen consists of 3 stages: Stage C, Stage B, Stage A. They all have different jobs and work only together. When generating text-conditional images, Stage C will first generate the latents in a very compressed latent space. This is what happens in the `prior_pipeline`. Afterwards, the generated latents will be passed to Stage B, which decompresses the latents into a bigger latent space of a VQGAN. These latents can then be decoded by Stage A, which is a VQGAN, into the pixel-space. Stage B & Stage A are both encapsulated in the `decoder_pipeline`. For more details, take a look at the [paper](https://huggingface.co/papers/2306.00637). ```python import torch @@ -97,7 +98,7 @@ decoder_output = decoder_pipeline( ``` ## Speed-Up Inference -You can make use of ``torch.compile`` function and gain a speed-up of about 2-3x: +You can make use of `torch.compile` function and gain a speed-up of about 2-3x: ```python pipeline.prior = torch.compile(pipeline.prior, mode="reduce-overhead", fullgraph=True) @@ -105,6 +106,7 @@ pipeline.decoder = torch.compile(pipeline.decoder, mode="reduce-overhead", fullg ``` ## Limitations + - Due to the high compression employed by Würstchen, generations can lack a good amount of detail. To our human eye, this is especially noticeable in faces, hands etc. - **Images can only be generated in 128-pixel steps**, e.g. 
the next higher resolution From b6e0b016cedaf60f51fa9f8d4e38c2324ee40783 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Mon, 11 Sep 2023 13:26:22 +0530 Subject: [PATCH 10/37] Lazy Import for Diffusers (#4829) * initial commit * move modules to import struct * add dummy objects and _LazyModule * add lazy import to schedulers * clean up unused imports * lazy import on models module * lazy import for schedulers module * add lazy import to pipelines module * lazy import altdiffusion * lazy import audio diffusion * lazy import audioldm * lazy import consistency model * lazy import controlnet * lazy import dance diffusion ddim ddpm * lazy import deepfloyd * lazy import kandinksy * lazy imports * lazy import semantic diffusion * lazy imports * lazy import stable diffusion * move sd output to its own module * clean up * lazy import t2iadapter * lazy import unclip * lazy import versatile and vq diffsuion * lazy import vq diffusion * helper to fetch objects from modules * lazy import sdxl * lazy import txt2vid * lazy import stochastic karras * fix model imports * fix bug * lazy import * clean up * clean up * fixes for tests * fixes for tests * clean up * remove import of torch_utils from utils module * clean up * clean up * fix mistake import statement * dedicated modules for exporting and loading * remove testing utils from utils module * fixes from merge conflicts * Update src/diffusers/pipelines/kandinsky2_2/__init__.py * fix docs * fix alt diffusion copied from * fix check dummies * fix more docs * remove accelerate import from utils module * add type checking * make style * fix check dummies * remove torch import from xformers check * clean up error message * fixes after upstream merges * dummy objects fix * fix tests * remove unused module import --------- Co-authored-by: Patrick von Platen --- docs/source/en/api/utilities.md | 16 +- .../en/using-diffusers/reproducibility.md | 22 +- ...p_guided_images_mixing_stable_diffusion.py | 6 +- .../clip_guided_stable_diffusion_img2img.py | 7 +- .../ddim_noise_comparative_analysis.py | 2 +- examples/community/lpw_stable_diffusion.py | 2 +- examples/community/lpw_stable_diffusion_xl.py | 2 +- examples/community/pipeline_fabric.py | 2 +- examples/community/pipeline_zero1to3.py | 2 +- examples/community/run_onnx_controlnet.py | 2 +- examples/community/run_tensorrt_controlnet.py | 2 +- .../stable_diffusion_controlnet_img2img.py | 2 +- .../stable_diffusion_controlnet_inpaint.py | 2 +- ...le_diffusion_controlnet_inpaint_img2img.py | 2 +- .../stable_diffusion_controlnet_reference.py | 3 +- examples/community/stable_diffusion_ipex.py | 2 +- .../community/stable_diffusion_reference.py | 3 +- .../community/stable_diffusion_repaint.py | 2 +- .../stable_diffusion_xl_reference.py | 3 +- examples/community/stable_unclip.py | 3 +- .../community/unclip_image_interpolation.py | 3 +- .../community/unclip_text_interpolation.py | 3 +- src/diffusers/__init__.py | 824 +++++++++++++----- .../experimental/rl/value_guided_sampling.py | 2 +- src/diffusers/models/__init__.py | 46 +- src/diffusers/models/attention.py | 2 +- src/diffusers/models/attention_processor.py | 3 +- src/diffusers/models/autoencoder_asym_kl.py | 2 +- src/diffusers/models/autoencoder_kl.py | 3 +- src/diffusers/models/autoencoder_tiny.py | 3 +- src/diffusers/models/vae.py | 3 +- src/diffusers/models/vq_model.py | 3 +- src/diffusers/pipelines/__init__.py | 323 ++++--- .../pipelines/alt_diffusion/__init__.py | 59 +- .../alt_diffusion/pipeline_alt_diffusion.py | 3 +- .../pipeline_alt_diffusion_img2img.py | 3 +- 
.../alt_diffusion/pipeline_output.py | 28 + .../pipelines/audio_diffusion/__init__.py | 20 +- .../pipeline_audio_diffusion.py | 2 +- src/diffusers/pipelines/audioldm/__init__.py | 23 +- .../pipelines/audioldm/pipeline_audioldm.py | 3 +- src/diffusers/pipelines/audioldm2/__init__.py | 28 +- .../pipelines/audioldm2/pipeline_audioldm2.py | 2 +- .../pipelines/consistency_models/__init__.py | 18 +- .../pipeline_consistency_models.py | 2 +- .../pipelines/controlnet/__init__.py | 52 +- .../controlnet/pipeline_controlnet.py | 3 +- .../controlnet/pipeline_controlnet_img2img.py | 3 +- .../controlnet/pipeline_controlnet_inpaint.py | 3 +- .../pipeline_controlnet_inpaint_sd_xl.py | 3 +- .../controlnet/pipeline_controlnet_sd_xl.py | 3 +- .../pipeline_controlnet_sd_xl_img2img.py | 3 +- .../pipelines/dance_diffusion/__init__.py | 17 +- .../pipeline_dance_diffusion.py | 3 +- src/diffusers/pipelines/ddim/__init__.py | 16 +- src/diffusers/pipelines/ddim/pipeline_ddim.py | 2 +- src/diffusers/pipelines/ddpm/__init__.py | 18 +- src/diffusers/pipelines/ddpm/pipeline_ddpm.py | 2 +- .../pipelines/deepfloyd_if/__init__.py | 89 +- .../pipelines/deepfloyd_if/pipeline_if.py | 2 +- .../deepfloyd_if/pipeline_if_img2img.py | 2 +- .../pipeline_if_img2img_superresolution.py | 2 +- .../deepfloyd_if/pipeline_if_inpainting.py | 2 +- .../pipeline_if_inpainting_superresolution.py | 2 +- .../pipeline_if_superresolution.py | 2 +- .../pipelines/deepfloyd_if/pipeline_output.py | 28 + src/diffusers/pipelines/dit/__init__.py | 16 +- src/diffusers/pipelines/dit/pipeline_dit.py | 2 +- src/diffusers/pipelines/kandinsky/__init__.py | 45 +- .../pipelines/kandinsky/pipeline_kandinsky.py | 2 +- .../kandinsky/pipeline_kandinsky_img2img.py | 2 +- .../kandinsky/pipeline_kandinsky_inpaint.py | 2 +- .../kandinsky/pipeline_kandinsky_prior.py | 2 +- .../pipelines/kandinsky2_2/__init__.py | 49 +- .../kandinsky2_2/pipeline_kandinsky2_2.py | 2 +- .../pipeline_kandinsky2_2_controlnet.py | 2 +- ...ipeline_kandinsky2_2_controlnet_img2img.py | 2 +- .../pipeline_kandinsky2_2_img2img.py | 2 +- .../pipeline_kandinsky2_2_inpainting.py | 2 +- .../pipeline_kandinsky2_2_prior.py | 2 +- .../pipeline_kandinsky2_2_prior_emb2emb.py | 2 +- .../pipelines/latent_diffusion/__init__.py | 34 +- .../pipeline_latent_diffusion.py | 2 +- ...peline_latent_diffusion_superresolution.py | 3 +- .../latent_diffusion_uncond/__init__.py | 16 +- .../pipeline_latent_diffusion_uncond.py | 2 +- src/diffusers/pipelines/musicldm/__init__.py | 27 +- .../pipelines/musicldm/pipeline_musicldm.py | 3 +- .../pipelines/paint_by_example/__init__.py | 32 +- .../pipeline_paint_by_example.py | 3 +- src/diffusers/pipelines/pipeline_utils.py | 2 +- src/diffusers/pipelines/pndm/__init__.py | 17 +- src/diffusers/pipelines/pndm/pipeline_pndm.py | 2 +- src/diffusers/pipelines/repaint/__init__.py | 16 +- .../pipelines/repaint/pipeline_repaint.py | 3 +- .../pipelines/score_sde_ve/__init__.py | 16 +- .../score_sde_ve/pipeline_score_sde_ve.py | 2 +- .../semantic_stable_diffusion/__init__.py | 56 +- .../pipeline_output.py | 25 + .../pipeline_semantic_stable_diffusion.py | 3 +- src/diffusers/pipelines/shap_e/__init__.py | 50 +- .../pipelines/shap_e/pipeline_shap_e.py | 2 +- .../shap_e/pipeline_shap_e_img2img.py | 2 +- .../spectrogram_diffusion/__init__.py | 46 +- .../pipeline_spectrogram_diffusion.py | 3 +- .../pipelines/stable_diffusion/__init__.py | 160 ++-- .../pipeline_cycle_diffusion.py | 3 +- .../stable_diffusion/pipeline_output.py | 49 ++ .../pipeline_stable_diffusion.py | 2 +- 
...line_stable_diffusion_attend_and_excite.py | 3 +- .../pipeline_stable_diffusion_depth2img.py | 3 +- .../pipeline_stable_diffusion_diffedit.py | 2 +- .../pipeline_stable_diffusion_gligen.py | 2 +- ...line_stable_diffusion_gligen_text_image.py | 2 +- ...peline_stable_diffusion_image_variation.py | 3 +- .../pipeline_stable_diffusion_img2img.py | 2 +- .../pipeline_stable_diffusion_inpaint.py | 3 +- ...ipeline_stable_diffusion_inpaint_legacy.py | 10 +- ...eline_stable_diffusion_instruct_pix2pix.py | 10 +- .../pipeline_stable_diffusion_k_diffusion.py | 3 +- ...ipeline_stable_diffusion_latent_upscale.py | 3 +- .../pipeline_stable_diffusion_ldm3d.py | 2 +- ...pipeline_stable_diffusion_model_editing.py | 3 +- .../pipeline_stable_diffusion_panorama.py | 3 +- .../pipeline_stable_diffusion_paradigms.py | 2 +- .../pipeline_stable_diffusion_pix2pix_zero.py | 2 +- .../pipeline_stable_diffusion_sag.py | 3 +- .../pipeline_stable_diffusion_upscale.py | 3 +- .../pipeline_stable_unclip.py | 2 +- .../pipeline_stable_unclip_img2img.py | 3 +- .../pipeline_stable_diffusion_safe.py | 3 +- .../pipelines/stable_diffusion_xl/__init__.py | 49 +- .../stable_diffusion_xl/pipeline_output.py | 21 + .../pipeline_stable_diffusion_xl.py | 2 +- .../pipeline_stable_diffusion_xl_img2img.py | 2 +- .../pipeline_stable_diffusion_xl_inpaint.py | 2 +- ...ne_stable_diffusion_xl_instruct_pix2pix.py | 2 +- .../stochastic_karras_ve/__init__.py | 16 +- .../pipeline_stochastic_karras_ve.py | 2 +- .../pipelines/t2i_adapter/__init__.py | 25 +- .../pipeline_stable_diffusion_adapter.py | 2 +- .../pipeline_stable_diffusion_xl_adapter.py | 2 +- .../text_to_video_synthesis/__init__.py | 51 +- .../pipeline_output.py | 23 + .../pipeline_text_to_video_synth.py | 2 +- .../pipeline_text_to_video_synth_img2img.py | 2 +- src/diffusers/pipelines/unclip/__init__.py | 27 +- .../pipelines/unclip/pipeline_unclip.py | 3 +- .../unclip/pipeline_unclip_image_variation.py | 3 +- .../pipelines/unidiffuser/__init__.py | 30 +- .../unidiffuser/pipeline_unidiffuser.py | 10 +- .../pipelines/versatile_diffusion/__init__.py | 37 +- ...ipeline_versatile_diffusion_dual_guided.py | 3 +- ...ine_versatile_diffusion_image_variation.py | 3 +- ...eline_versatile_diffusion_text_to_image.py | 3 +- .../pipelines/vq_diffusion/__init__.py | 35 +- .../pipelines/wuerstchen/__init__.py | 44 +- .../wuerstchen/modeling_paella_vq_model.py | 2 +- .../wuerstchen/pipeline_wuerstchen.py | 3 +- .../wuerstchen/pipeline_wuerstchen_prior.py | 2 +- src/diffusers/schedulers/__init__.py | 126 ++- .../scheduling_consistency_models.py | 3 +- src/diffusers/schedulers/scheduling_ddim.py | 3 +- .../schedulers/scheduling_ddim_parallel.py | 3 +- src/diffusers/schedulers/scheduling_ddpm.py | 3 +- .../schedulers/scheduling_ddpm_parallel.py | 3 +- .../schedulers/scheduling_ddpm_wuerstchen.py | 3 +- .../scheduling_dpmsolver_multistep.py | 2 +- .../scheduling_dpmsolver_multistep_inverse.py | 2 +- .../scheduling_euler_ancestral_discrete.py | 3 +- .../schedulers/scheduling_euler_discrete.py | 3 +- .../scheduling_k_dpm_2_ancestral_discrete.py | 2 +- .../schedulers/scheduling_karras_ve.py | 3 +- .../schedulers/scheduling_repaint.py | 3 +- src/diffusers/schedulers/scheduling_sde_ve.py | 3 +- src/diffusers/schedulers/scheduling_sde_vp.py | 2 +- src/diffusers/schedulers/scheduling_unclip.py | 3 +- src/diffusers/utils/__init__.py | 28 +- src/diffusers/utils/export_utils.py | 132 +++ src/diffusers/utils/import_utils.py | 90 +- src/diffusers/utils/loading_utils.py | 37 + tests/models/test_layers_utils.py | 2 +- 
tests/models/test_lora_layers.py | 3 +- tests/models/test_modeling_common.py | 3 +- tests/models/test_models_prior.py | 3 +- tests/models/test_models_unet_1d.py | 2 +- tests/models/test_models_unet_2d.py | 10 +- tests/models/test_models_unet_2d_condition.py | 8 +- tests/models/test_models_unet_3d_condition.py | 9 +- tests/models/test_models_vae.py | 11 +- tests/models/test_models_vq.py | 3 +- tests/models/test_unet_2d_blocks.py | 2 +- tests/models/test_unet_blocks_common.py | 4 +- .../altdiffusion/test_alt_diffusion.py | 3 +- .../test_alt_diffusion_img2img.py | 11 +- .../audio_diffusion/test_audio_diffusion.py | 3 +- tests/pipelines/audioldm/test_audioldm.py | 4 +- tests/pipelines/audioldm2/test_audioldm2.py | 4 +- .../test_consistency_models.py | 10 +- tests/pipelines/controlnet/test_controlnet.py | 6 +- .../controlnet/test_controlnet_img2img.py | 12 +- .../controlnet/test_controlnet_inpaint.py | 12 +- .../test_controlnet_inpaint_sdxl.py | 3 +- .../controlnet/test_controlnet_sdxl.py | 4 +- .../test_controlnet_sdxl_img2img.py | 3 +- .../controlnet/test_flax_controlnet.py | 4 +- .../dance_diffusion/test_dance_diffusion.py | 3 +- .../pipelines/deepfloyd_if/test_if_img2img.py | 3 +- .../test_if_img2img_superresolution.py | 3 +- .../deepfloyd_if/test_if_inpainting.py | 3 +- .../test_if_inpainting_superresolution.py | 3 +- .../deepfloyd_if/test_if_superresolution.py | 3 +- tests/pipelines/dit/test_dit.py | 4 +- tests/pipelines/kandinsky/test_kandinsky.py | 10 +- .../kandinsky/test_kandinsky_combined.py | 3 +- .../kandinsky/test_kandinsky_img2img.py | 12 +- .../kandinsky/test_kandinsky_inpaint.py | 11 +- .../kandinsky/test_kandinsky_prior.py | 3 +- .../pipelines/kandinsky_v22/test_kandinsky.py | 10 +- .../kandinsky_v22/test_kandinsky_combined.py | 3 +- .../test_kandinsky_controlnet.py | 11 +- .../test_kandinsky_controlnet_img2img.py | 11 +- .../kandinsky_v22/test_kandinsky_img2img.py | 11 +- .../kandinsky_v22/test_kandinsky_inpaint.py | 11 +- .../kandinsky_v22/test_kandinsky_prior.py | 3 +- .../test_kandinsky_prior_emb2emb.py | 3 +- .../test_latent_diffusion_superresolution.py | 11 +- tests/pipelines/musicldm/test_musicldm.py | 4 +- .../paint_by_example/test_paint_by_example.py | 10 +- .../test_semantic_diffusion.py | 9 +- tests/pipelines/shap_e/test_shap_e.py | 3 +- tests/pipelines/shap_e/test_shap_e_img2img.py | 3 +- .../test_spectrogram_diffusion.py | 11 +- .../stable_diffusion/test_cycle_diffusion.py | 12 +- .../test_onnx_stable_diffusion_img2img.py | 2 +- .../test_onnx_stable_diffusion_upscale.py | 2 +- .../stable_diffusion/test_stable_diffusion.py | 5 +- .../test_stable_diffusion_adapter.py | 12 +- ...test_stable_diffusion_gligen_text_image.py | 2 +- .../test_stable_diffusion_image_variation.py | 12 +- .../test_stable_diffusion_img2img.py | 7 +- .../test_stable_diffusion_inpaint.py | 7 +- .../test_stable_diffusion_inpaint_legacy.py | 13 +- ...st_stable_diffusion_instruction_pix2pix.py | 10 +- .../test_stable_diffusion_k_diffusion.py | 3 +- .../test_stable_diffusion_ldm3d.py | 3 +- .../test_stable_diffusion_model_editing.py | 3 +- .../test_stable_diffusion_panorama.py | 3 +- .../test_stable_diffusion_paradigms.py | 3 +- .../test_stable_diffusion_pix2pix_zero.py | 13 +- .../test_stable_diffusion_sag.py | 3 +- .../test_stable_diffusion.py | 5 +- ...test_stable_diffusion_attend_and_excite.py | 9 +- .../test_stable_diffusion_depth.py | 9 +- .../test_stable_diffusion_diffedit.py | 11 +- .../test_stable_diffusion_flax.py | 4 +- .../test_stable_diffusion_flax_inpaint.py | 4 +- 
.../test_stable_diffusion_inpaint.py | 11 +- .../test_stable_diffusion_latent_upscale.py | 11 +- .../test_stable_diffusion_upscale.py | 11 +- .../test_stable_diffusion_v_pred.py | 10 +- .../test_safe_diffusion.py | 3 +- .../test_stable_diffusion_xl.py | 3 +- .../test_stable_diffusion_xl_adapter.py | 3 +- .../test_stable_diffusion_xl_img2img.py | 3 +- .../test_stable_diffusion_xl_inpaint.py | 3 +- ...stable_diffusion_xl_instruction_pix2pix.py | 3 +- tests/pipelines/test_pipelines.py | 12 +- tests/pipelines/test_pipelines_auto.py | 2 +- tests/pipelines/test_pipelines_common.py | 8 +- .../text_to_video/test_text_to_video.py | 11 +- .../text_to_video/test_text_to_video_zero.py | 2 +- .../text_to_video/test_video_to_video.py | 10 +- tests/pipelines/unclip/test_unclip.py | 11 +- .../unclip/test_unclip_image_variation.py | 12 +- .../pipelines/unidiffuser/test_unidiffuser.py | 4 +- .../vq_diffusion/test_vq_diffusion.py | 3 +- .../wuerstchen/test_wuerstchen_combined.py | 3 +- .../wuerstchen/test_wuerstchen_decoder.py | 3 +- .../wuerstchen/test_wuerstchen_prior.py | 8 +- tests/schedulers/test_scheduler_dpm_sde.py | 3 +- tests/schedulers/test_scheduler_euler.py | 2 +- .../test_scheduler_euler_ancestral.py | 2 +- tests/schedulers/test_scheduler_heun.py | 2 +- .../test_scheduler_kdpm2_ancestral.py | 2 +- .../test_scheduler_kdpm2_discrete.py | 2 +- tests/schedulers/test_scheduler_lms.py | 2 +- tests/schedulers/test_schedulers.py | 3 +- utils/check_copies.py | 10 - utils/check_dummies.py | 11 +- 290 files changed, 2885 insertions(+), 1182 deletions(-) create mode 100644 src/diffusers/pipelines/alt_diffusion/pipeline_output.py create mode 100644 src/diffusers/pipelines/deepfloyd_if/pipeline_output.py create mode 100644 src/diffusers/pipelines/semantic_stable_diffusion/pipeline_output.py create mode 100644 src/diffusers/pipelines/stable_diffusion/pipeline_output.py create mode 100644 src/diffusers/pipelines/stable_diffusion_xl/pipeline_output.py create mode 100644 src/diffusers/pipelines/text_to_video_synthesis/pipeline_output.py create mode 100644 src/diffusers/utils/export_utils.py create mode 100644 src/diffusers/utils/loading_utils.py diff --git a/docs/source/en/api/utilities.md b/docs/source/en/api/utilities.md index 9edf3e37218a..abc38416053a 100644 --- a/docs/source/en/api/utilities.md +++ b/docs/source/en/api/utilities.md @@ -2,30 +2,26 @@ Utility and helper functions for working with 🤗 Diffusers. 
-## randn_tensor - -[[autodoc]] diffusers.utils.randn_tensor - ## numpy_to_pil -[[autodoc]] utils.pil_utils.numpy_to_pil +[[autodoc]] utils.numpy_to_pil ## pt_to_pil -[[autodoc]] utils.pil_utils.pt_to_pil +[[autodoc]] utils.pt_to_pil ## load_image -[[autodoc]] utils.testing_utils.load_image +[[autodoc]] utils.load_image ## export_to_gif -[[autodoc]] utils.testing_utils.export_to_gif +[[autodoc]] utils.export_to_gif ## export_to_video -[[autodoc]] utils.testing_utils.export_to_video +[[autodoc]] utils.export_to_video ## make_image_grid -[[autodoc]] utils.pil_utils.make_image_grid \ No newline at end of file +[[autodoc]] utils.pil_utils.make_image_grid diff --git a/docs/source/en/using-diffusers/reproducibility.md b/docs/source/en/using-diffusers/reproducibility.md index b02ca070a1a2..0da760f0192d 100644 --- a/docs/source/en/using-diffusers/reproducibility.md +++ b/docs/source/en/using-diffusers/reproducibility.md @@ -28,7 +28,7 @@ This is why it's important to understand how to control sources of randomness in ## Control randomness -During inference, pipelines rely heavily on random sampling operations which include creating the +During inference, pipelines rely heavily on random sampling operations which include creating the Gaussian noise tensors to denoise and adding noise to the scheduling step. Take a look at the tensor values in the [`DDIMPipeline`] after two inference steps: @@ -47,7 +47,7 @@ image = ddim(num_inference_steps=2, output_type="np").images print(np.abs(image).sum()) ``` -Running the code above prints one value, but if you run it again you get a different value. What is going on here? +Running the code above prints one value, but if you run it again you get a different value. What is going on here? Every time the pipeline is run, [`torch.randn`](https://pytorch.org/docs/stable/generated/torch.randn.html) uses a different random seed to create Gaussian noise which is denoised stepwise. This leads to a different result each time it is run, which is great for diffusion pipelines since it generates a different random image each time. @@ -81,16 +81,16 @@ If you run this code example on your specific hardware and PyTorch version, you -💡 It might be a bit unintuitive at first to pass `Generator` objects to the pipeline instead of -just integer values representing the seed, but this is the recommended design when dealing with -probabilistic models in PyTorch as `Generator`'s are *random states* that can be +💡 It might be a bit unintuitive at first to pass `Generator` objects to the pipeline instead of +just integer values representing the seed, but this is the recommended design when dealing with +probabilistic models in PyTorch as `Generator`'s are *random states* that can be passed to multiple pipelines in a sequence. ### GPU -Writing a reproducible pipeline on a GPU is a bit trickier, and full reproducibility across different hardware is not guaranteed because matrix multiplication - which diffusion pipelines require a lot of - is less deterministic on a GPU than a CPU. For example, if you run the same code example above on a GPU: +Writing a reproducible pipeline on a GPU is a bit trickier, and full reproducibility across different hardware is not guaranteed because matrix multiplication - which diffusion pipelines require a lot of - is less deterministic on a GPU than a CPU. 
For example, if you run the same code example above on a GPU: ```python import torch @@ -113,7 +113,7 @@ print(np.abs(image).sum()) The result is not the same even though you're using an identical seed because the GPU uses a different random number generator than the CPU. -To circumvent this problem, 🧨 Diffusers has a [`~diffusers.utils.randn_tensor`] function for creating random noise on the CPU, and then moving the tensor to a GPU if necessary. The `randn_tensor` function is used everywhere inside the pipeline, allowing the user to **always** pass a CPU `Generator` even if the pipeline is run on a GPU. +To circumvent this problem, 🧨 Diffusers has a [`~diffusers.utils.torch_utils.randn_tensor`] function for creating random noise on the CPU, and then moving the tensor to a GPU if necessary. The `randn_tensor` function is used everywhere inside the pipeline, allowing the user to **always** pass a CPU `Generator` even if the pipeline is run on a GPU. You'll see the results are much closer now! @@ -139,14 +139,14 @@ print(np.abs(image).sum()) 💡 If reproducibility is important, we recommend always passing a CPU generator. -The performance loss is often neglectable, and you'll generate much more similar +The performance loss is often neglectable, and you'll generate much more similar values than if the pipeline had been run on a GPU. -Finally, for more complex pipelines such as [`UnCLIPPipeline`], these are often extremely -susceptible to precision error propagation. Don't expect similar results across -different GPU hardware or PyTorch versions. In this case, you'll need to run +Finally, for more complex pipelines such as [`UnCLIPPipeline`], these are often extremely +susceptible to precision error propagation. Don't expect similar results across +different GPU hardware or PyTorch versions. In this case, you'll need to run exactly the same hardware and PyTorch version for full reproducibility. 
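A minimal sketch of the pattern just described, using the helper from its new `diffusers.utils.torch_utils` location (the shape here is an arbitrary example, and a CUDA device is assumed to be available):

```python
import torch
from diffusers.utils.torch_utils import randn_tensor

# The noise is sampled on the CPU with the seeded generator and only then
# moved to the GPU, so the same seed yields the same latents on any device.
generator = torch.Generator(device="cpu").manual_seed(0)
latents = randn_tensor((1, 4, 64, 64), generator=generator, device=torch.device("cuda"))
```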
## Deterministic algorithms diff --git a/examples/community/clip_guided_images_mixing_stable_diffusion.py b/examples/community/clip_guided_images_mixing_stable_diffusion.py index 8cf8e595292a..a6b477df6b7f 100644 --- a/examples/community/clip_guided_images_mixing_stable_diffusion.py +++ b/examples/community/clip_guided_images_mixing_stable_diffusion.py @@ -19,10 +19,8 @@ UNet2DConditionModel, ) from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput -from diffusers.utils import ( - PIL_INTERPOLATION, - randn_tensor, -) +from diffusers.utils import PIL_INTERPOLATION +from diffusers.utils.torch_utils import randn_tensor def preprocess(image, w, h): diff --git a/examples/community/clip_guided_stable_diffusion_img2img.py b/examples/community/clip_guided_stable_diffusion_img2img.py index a72a5a127c72..ad9ca804058c 100644 --- a/examples/community/clip_guided_stable_diffusion_img2img.py +++ b/examples/community/clip_guided_stable_diffusion_img2img.py @@ -19,11 +19,8 @@ UNet2DConditionModel, ) from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput -from diffusers.utils import ( - PIL_INTERPOLATION, - deprecate, - randn_tensor, -) +from diffusers.utils import PIL_INTERPOLATION, deprecate +from diffusers.utils.torch_utils import randn_tensor EXAMPLE_DOC_STRING = """ diff --git a/examples/community/ddim_noise_comparative_analysis.py b/examples/community/ddim_noise_comparative_analysis.py index c4f51c489ff4..e0784fc5138a 100644 --- a/examples/community/ddim_noise_comparative_analysis.py +++ b/examples/community/ddim_noise_comparative_analysis.py @@ -20,7 +20,7 @@ from diffusers.pipeline_utils import DiffusionPipeline, ImagePipelineOutput from diffusers.schedulers import DDIMScheduler -from diffusers.utils import randn_tensor +from diffusers.utils.torch_utils import randn_tensor trans = transforms.Compose( diff --git a/examples/community/lpw_stable_diffusion.py b/examples/community/lpw_stable_diffusion.py index 19975e6ded87..89345a8a5eb3 100644 --- a/examples/community/lpw_stable_diffusion.py +++ b/examples/community/lpw_stable_diffusion.py @@ -21,8 +21,8 @@ is_accelerate_available, is_accelerate_version, logging, - randn_tensor, ) +from diffusers.utils.torch_utils import randn_tensor # ------------------------------------------------------------------------------ diff --git a/examples/community/lpw_stable_diffusion_xl.py b/examples/community/lpw_stable_diffusion_xl.py index abfbfb5aa1c1..2ee44b95ab0a 100644 --- a/examples/community/lpw_stable_diffusion_xl.py +++ b/examples/community/lpw_stable_diffusion_xl.py @@ -30,9 +30,9 @@ is_accelerate_version, is_invisible_watermark_available, logging, - randn_tensor, replace_example_docstring, ) +from diffusers.utils.torch_utils import randn_tensor if is_invisible_watermark_available(): diff --git a/examples/community/pipeline_fabric.py b/examples/community/pipeline_fabric.py index 456e69cade13..c5783402b36c 100644 --- a/examples/community/pipeline_fabric.py +++ b/examples/community/pipeline_fabric.py @@ -14,6 +14,7 @@ from typing import List, Optional, Union import torch +from diffuser.utils.torch_utils import randn_tensor from packaging import version from PIL import Image from transformers import CLIPTextModel, CLIPTokenizer @@ -30,7 +31,6 @@ from diffusers.utils import ( deprecate, logging, - randn_tensor, replace_example_docstring, ) diff --git a/examples/community/pipeline_zero1to3.py b/examples/community/pipeline_zero1to3.py index 8dc6874d2a86..c58d18508196 
100644 --- a/examples/community/pipeline_zero1to3.py +++ b/examples/community/pipeline_zero1to3.py @@ -35,9 +35,9 @@ is_accelerate_available, is_accelerate_version, logging, - randn_tensor, replace_example_docstring, ) +from diffusers.utils.torch_utils import randn_tensor logger = logging.get_logger(__name__) # pylint: disable=invalid-name diff --git a/examples/community/run_onnx_controlnet.py b/examples/community/run_onnx_controlnet.py index a79942b63a59..877138a408d2 100644 --- a/examples/community/run_onnx_controlnet.py +++ b/examples/community/run_onnx_controlnet.py @@ -8,6 +8,7 @@ import numpy as np import PIL.Image import torch +from diffuser.utils.torch_utils import randn_tensor from PIL import Image from transformers import CLIPTokenizer @@ -19,7 +20,6 @@ from diffusers.utils import ( deprecate, logging, - randn_tensor, replace_example_docstring, ) diff --git a/examples/community/run_tensorrt_controlnet.py b/examples/community/run_tensorrt_controlnet.py index a9030663c12f..e3800be542ad 100644 --- a/examples/community/run_tensorrt_controlnet.py +++ b/examples/community/run_tensorrt_controlnet.py @@ -11,6 +11,7 @@ import pycuda.driver as cuda import tensorrt as trt import torch +from diffuser.utils.torch_utils import randn_tensor from PIL import Image from pycuda.tools import make_default_context from transformers import CLIPTokenizer @@ -23,7 +24,6 @@ from diffusers.utils import ( deprecate, logging, - randn_tensor, replace_example_docstring, ) diff --git a/examples/community/stable_diffusion_controlnet_img2img.py b/examples/community/stable_diffusion_controlnet_img2img.py index 200e9a62abb9..71009fb1aa69 100644 --- a/examples/community/stable_diffusion_controlnet_img2img.py +++ b/examples/community/stable_diffusion_controlnet_img2img.py @@ -16,9 +16,9 @@ PIL_INTERPOLATION, is_accelerate_available, is_accelerate_version, - randn_tensor, replace_example_docstring, ) +from diffusers.utils.torch_utils import randn_tensor logger = logging.get_logger(__name__) # pylint: disable=invalid-name diff --git a/examples/community/stable_diffusion_controlnet_inpaint.py b/examples/community/stable_diffusion_controlnet_inpaint.py index 9f36363fb124..3cd9f9f0a258 100644 --- a/examples/community/stable_diffusion_controlnet_inpaint.py +++ b/examples/community/stable_diffusion_controlnet_inpaint.py @@ -17,9 +17,9 @@ PIL_INTERPOLATION, is_accelerate_available, is_accelerate_version, - randn_tensor, replace_example_docstring, ) +from diffusers.utils.torch_utils import randn_tensor logger = logging.get_logger(__name__) # pylint: disable=invalid-name diff --git a/examples/community/stable_diffusion_controlnet_inpaint_img2img.py b/examples/community/stable_diffusion_controlnet_inpaint_img2img.py index 2f2acebe9aa0..341e89398f7d 100644 --- a/examples/community/stable_diffusion_controlnet_inpaint_img2img.py +++ b/examples/community/stable_diffusion_controlnet_inpaint_img2img.py @@ -16,9 +16,9 @@ PIL_INTERPOLATION, is_accelerate_available, is_accelerate_version, - randn_tensor, replace_example_docstring, ) +from diffusers.utils.torch_utils import randn_tensor logger = logging.get_logger(__name__) # pylint: disable=invalid-name diff --git a/examples/community/stable_diffusion_controlnet_reference.py b/examples/community/stable_diffusion_controlnet_reference.py index 1503f9f6a883..0814c6b22af9 100644 --- a/examples/community/stable_diffusion_controlnet_reference.py +++ b/examples/community/stable_diffusion_controlnet_reference.py @@ -11,7 +11,8 @@ from diffusers.models.unet_2d_blocks import CrossAttnDownBlock2D, 
CrossAttnUpBlock2D, DownBlock2D, UpBlock2D from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput -from diffusers.utils import is_compiled_module, logging, randn_tensor +from diffusers.utils import logging +from diffusers.utils.torch_utils import is_compiled_module, randn_tensor logger = logging.get_logger(__name__) # pylint: disable=invalid-name diff --git a/examples/community/stable_diffusion_ipex.py b/examples/community/stable_diffusion_ipex.py index 146acb773a56..bef575559e07 100644 --- a/examples/community/stable_diffusion_ipex.py +++ b/examples/community/stable_diffusion_ipex.py @@ -31,9 +31,9 @@ is_accelerate_available, is_accelerate_version, logging, - randn_tensor, replace_example_docstring, ) +from diffusers.utils.torch_utils import randn_tensor logger = logging.get_logger(__name__) # pylint: disable=invalid-name diff --git a/examples/community/stable_diffusion_reference.py b/examples/community/stable_diffusion_reference.py index 68e30f15bce6..3f46e05f653f 100644 --- a/examples/community/stable_diffusion_reference.py +++ b/examples/community/stable_diffusion_reference.py @@ -10,7 +10,8 @@ from diffusers.models.unet_2d_blocks import CrossAttnDownBlock2D, CrossAttnUpBlock2D, DownBlock2D, UpBlock2D from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import rescale_noise_cfg -from diffusers.utils import PIL_INTERPOLATION, logging, randn_tensor +from diffusers.utils import PIL_INTERPOLATION, logging +from diffusers.utils.torch_utils import randn_tensor logger = logging.get_logger(__name__) # pylint: disable=invalid-name diff --git a/examples/community/stable_diffusion_repaint.py b/examples/community/stable_diffusion_repaint.py index 3fd63d4b213a..dd0c9f683ec6 100644 --- a/examples/community/stable_diffusion_repaint.py +++ b/examples/community/stable_diffusion_repaint.py @@ -33,8 +33,8 @@ is_accelerate_available, is_accelerate_version, logging, - randn_tensor, ) +from diffusers.utils.torch_utils import randn_tensor logger = logging.get_logger(__name__) # pylint: disable=invalid-name diff --git a/examples/community/stable_diffusion_xl_reference.py b/examples/community/stable_diffusion_xl_reference.py index b47c962701b6..7549135b220f 100644 --- a/examples/community/stable_diffusion_xl_reference.py +++ b/examples/community/stable_diffusion_xl_reference.py @@ -15,7 +15,8 @@ UpBlock2D, ) from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput -from diffusers.utils import PIL_INTERPOLATION, logging, randn_tensor +from diffusers.utils import PIL_INTERPOLATION, logging +from diffusers.utils.torch_utils import randn_tensor logger = logging.get_logger(__name__) # pylint: disable=invalid-name diff --git a/examples/community/stable_unclip.py b/examples/community/stable_unclip.py index 1b438c8fcb3e..6acca20d6a78 100644 --- a/examples/community/stable_unclip.py +++ b/examples/community/stable_unclip.py @@ -8,7 +8,8 @@ from diffusers.models import PriorTransformer from diffusers.pipelines import DiffusionPipeline, StableDiffusionImageVariationPipeline from diffusers.schedulers import UnCLIPScheduler -from diffusers.utils import logging, randn_tensor +from diffusers.utils import logging +from diffusers.utils.torch_utils import randn_tensor logger = logging.get_logger(__name__) # pylint: disable=invalid-name diff --git a/examples/community/unclip_image_interpolation.py 
b/examples/community/unclip_image_interpolation.py index 618ac25bdc95..98d88bb90c23 100644 --- a/examples/community/unclip_image_interpolation.py +++ b/examples/community/unclip_image_interpolation.py @@ -19,7 +19,8 @@ UNet2DModel, ) from diffusers.pipelines.unclip import UnCLIPTextProjModel -from diffusers.utils import is_accelerate_available, logging, randn_tensor +from diffusers.utils import is_accelerate_available, logging +from diffusers.utils.torch_utils import randn_tensor logger = logging.get_logger(__name__) # pylint: disable=invalid-name diff --git a/examples/community/unclip_text_interpolation.py b/examples/community/unclip_text_interpolation.py index 290f45317004..764299433b4c 100644 --- a/examples/community/unclip_text_interpolation.py +++ b/examples/community/unclip_text_interpolation.py @@ -15,7 +15,8 @@ UNet2DModel, ) from diffusers.pipelines.unclip import UnCLIPTextProjModel -from diffusers.utils import is_accelerate_available, logging, randn_tensor +from diffusers.utils import is_accelerate_available, logging +from diffusers.utils.torch_utils import randn_tensor logger = logging.get_logger(__name__) # pylint: disable=invalid-name diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index d72c671671c1..87feab66503b 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -1,13 +1,12 @@ __version__ = "0.21.0.dev0" -from .configuration_utils import ConfigMixin +from typing import TYPE_CHECKING + from .utils import ( OptionalDependencyNotAvailable, + _LazyModule, is_flax_available, - is_inflect_available, - is_invisible_watermark_available, is_k_diffusion_available, - is_k_diffusion_version, is_librosa_available, is_note_seq_available, is_onnx_available, @@ -15,272 +14,364 @@ is_torch_available, is_torchsde_available, is_transformers_available, - is_transformers_version, - is_unidecode_available, - logging, ) +# Lazy Import based on +# https://github.com/huggingface/transformers/blob/main/src/transformers/__init__.py + +# When adding a new object to this init, please add it to `_import_structure`. The `_import_structure` is a dictionary submodule to list of object names, +# and is used to defer the actual importing for when the objects are requested. +# This way `import diffusers` provides the names in the namespace without actually importing anything (and especially none of the backends). 
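For readers unfamiliar with the mechanism, a stripped-down sketch of what such a lazy `__init__.py` amounts to; this is illustrative only (the actual implementation is `_LazyModule`, and the dictionary entries below are just examples drawn from the lists in this patch):

```python
# PEP 562: a module-level __getattr__ runs the real import on first attribute access.
import importlib

_import_structure = {"models": ["AutoencoderKL"], "schedulers": ["DDIMScheduler"]}

def __getattr__(name):
    for submodule, objects in _import_structure.items():
        if name in objects:
            module = importlib.import_module(f".{submodule}", __name__)
            return getattr(module, name)
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```

With this in place, `import diffusers` stays cheap; accessing `diffusers.AutoencoderKL` triggers the `diffusers.models` import only when first needed.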
+ +_import_structure = { + "configuration_utils": ["ConfigMixin"], + "models": [], + "pipelines": [], + "schedulers": [], + "utils": [ + "OptionalDependencyNotAvailable", + "is_flax_available", + "is_inflect_available", + "is_invisible_watermark_available", + "is_k_diffusion_available", + "is_k_diffusion_version", + "is_librosa_available", + "is_note_seq_available", + "is_onnx_available", + "is_scipy_available", + "is_torch_available", + "is_torchsde_available", + "is_transformers_available", + "is_transformers_version", + "is_unidecode_available", + "logging", + ], +} + try: if not is_onnx_available(): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from .utils.dummy_onnx_objects import * # noqa F403 + from .utils import dummy_onnx_objects # noqa F403 + + _import_structure["utils.dummy_onnx_objects"] = [ + name for name in dir(dummy_onnx_objects) if not name.startswith("_") + ] + else: - from .pipelines import OnnxRuntimeModel + _import_structure["pipelines"].extend(["OnnxRuntimeModel"]) try: if not is_torch_available(): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from .utils.dummy_pt_objects import * # noqa F403 + from .utils import dummy_pt_objects # noqa F403 + + _import_structure["utils.dummy_pt_objects"] = [name for name in dir(dummy_pt_objects) if not name.startswith("_")] + else: - from .models import ( - AsymmetricAutoencoderKL, - AutoencoderKL, - AutoencoderTiny, - ControlNetModel, - ModelMixin, - MultiAdapter, - PriorTransformer, - T2IAdapter, - T5FilmDecoder, - Transformer2DModel, - UNet1DModel, - UNet2DConditionModel, - UNet2DModel, - UNet3DConditionModel, - VQModel, - ) - from .optimization import ( - get_constant_schedule, - get_constant_schedule_with_warmup, - get_cosine_schedule_with_warmup, - get_cosine_with_hard_restarts_schedule_with_warmup, - get_linear_schedule_with_warmup, - get_polynomial_decay_schedule_with_warmup, - get_scheduler, + _import_structure["models"].extend( + [ + "AsymmetricAutoencoderKL", + "AutoencoderKL", + "AutoencoderTiny", + "ControlNetModel", + "ModelMixin", + "MultiAdapter", + "PriorTransformer", + "T2IAdapter", + "T5FilmDecoder", + "Transformer2DModel", + "UNet1DModel", + "UNet2DConditionModel", + "UNet2DModel", + "UNet3DConditionModel", + "VQModel", + ] ) - from .pipelines import ( - AudioPipelineOutput, - AutoPipelineForImage2Image, - AutoPipelineForInpainting, - AutoPipelineForText2Image, - CLIPImageProjection, - ConsistencyModelPipeline, - DanceDiffusionPipeline, - DDIMPipeline, - DDPMPipeline, - DiffusionPipeline, - DiTPipeline, - ImagePipelineOutput, - KarrasVePipeline, - LDMPipeline, - LDMSuperResolutionPipeline, - PNDMPipeline, - RePaintPipeline, - ScoreSdeVePipeline, + _import_structure["optimization"] = [ + "get_constant_schedule", + "get_constant_schedule_with_warmup", + "get_cosine_schedule_with_warmup", + "get_cosine_with_hard_restarts_schedule_with_warmup", + "get_linear_schedule_with_warmup", + "get_polynomial_decay_schedule_with_warmup", + "get_scheduler", + ] + + _import_structure["pipelines"].extend( + [ + "AudioPipelineOutput", + "AutoPipelineForImage2Image", + "AutoPipelineForInpainting", + "AutoPipelineForText2Image", + "ConsistencyModelPipeline", + "DanceDiffusionPipeline", + "DDIMPipeline", + "DDPMPipeline", + "DiffusionPipeline", + "DiTPipeline", + "ImagePipelineOutput", + "KarrasVePipeline", + "LDMPipeline", + "LDMSuperResolutionPipeline", + "PNDMPipeline", + "RePaintPipeline", + "ScoreSdeVePipeline", + ] ) - from .schedulers import ( - 
CMStochasticIterativeScheduler, - DDIMInverseScheduler, - DDIMParallelScheduler, - DDIMScheduler, - DDPMParallelScheduler, - DDPMScheduler, - DDPMWuerstchenScheduler, - DEISMultistepScheduler, - DPMSolverMultistepInverseScheduler, - DPMSolverMultistepScheduler, - DPMSolverSinglestepScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - HeunDiscreteScheduler, - IPNDMScheduler, - KarrasVeScheduler, - KDPM2AncestralDiscreteScheduler, - KDPM2DiscreteScheduler, - PNDMScheduler, - RePaintScheduler, - SchedulerMixin, - ScoreSdeVeScheduler, - UnCLIPScheduler, - UniPCMultistepScheduler, - VQDiffusionScheduler, + _import_structure["schedulers"].extend( + [ + "CMStochasticIterativeScheduler", + "DDIMInverseScheduler", + "DDIMParallelScheduler", + "DDIMScheduler", + "DDPMParallelScheduler", + "DDPMScheduler", + "DDPMWuerstchenScheduler", + "DEISMultistepScheduler", + "DPMSolverMultistepInverseScheduler", + "DPMSolverMultistepScheduler", + "DPMSolverSinglestepScheduler", + "EulerAncestralDiscreteScheduler", + "EulerDiscreteScheduler", + "HeunDiscreteScheduler", + "IPNDMScheduler", + "KarrasVeScheduler", + "KDPM2AncestralDiscreteScheduler", + "KDPM2DiscreteScheduler", + "PNDMScheduler", + "RePaintScheduler", + "SchedulerMixin", + "ScoreSdeVeScheduler", + "UnCLIPScheduler", + "UniPCMultistepScheduler", + "VQDiffusionScheduler", + ] ) - from .training_utils import EMAModel + _import_structure["training_utils"] = ["EMAModel"] try: if not (is_torch_available() and is_scipy_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from .utils.dummy_torch_and_scipy_objects import * # noqa F403 + from .utils import dummy_torch_and_scipy_objects # noqa F403 + + _import_structure["utils.dummy_torch_and_scipy_objects"] = [ + name for name in dir(dummy_torch_and_scipy_objects) if not name.startswith("_") + ] + else: - from .schedulers import LMSDiscreteScheduler + _import_structure["schedulers"].extend(["LMSDiscreteScheduler"]) try: if not (is_torch_available() and is_torchsde_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from .utils.dummy_torch_and_torchsde_objects import * # noqa F403 + from .utils import dummy_torch_and_torchsde_objects # noqa F403 + + _import_structure["utils.dummy_torch_and_torchsde_objects"] = [ + name for name in dir(dummy_torch_and_torchsde_objects) if not name.startswith("_") + ] + else: - from .schedulers import DPMSolverSDEScheduler + _import_structure["schedulers"].extend(["DPMSolverSDEScheduler"]) try: if not (is_torch_available() and is_transformers_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from .utils.dummy_torch_and_transformers_objects import * # noqa F403 + from .utils import dummy_torch_and_transformers_objects # noqa F403 + + _import_structure["utils.dummy_torch_and_transformers_objects"] = [ + name for name in dir(dummy_torch_and_transformers_objects) if not name.startswith("_") + ] + else: - from .pipelines import ( - AltDiffusionImg2ImgPipeline, - AltDiffusionPipeline, - AudioLDM2Pipeline, - AudioLDM2ProjectionModel, - AudioLDM2UNet2DConditionModel, - AudioLDMPipeline, - CycleDiffusionPipeline, - IFImg2ImgPipeline, - IFImg2ImgSuperResolutionPipeline, - IFInpaintingPipeline, - IFInpaintingSuperResolutionPipeline, - IFPipeline, - IFSuperResolutionPipeline, - ImageTextPipelineOutput, - KandinskyCombinedPipeline, - KandinskyImg2ImgCombinedPipeline, - KandinskyImg2ImgPipeline, - KandinskyInpaintCombinedPipeline, - 
KandinskyInpaintPipeline, - KandinskyPipeline, - KandinskyPriorPipeline, - KandinskyV22CombinedPipeline, - KandinskyV22ControlnetImg2ImgPipeline, - KandinskyV22ControlnetPipeline, - KandinskyV22Img2ImgCombinedPipeline, - KandinskyV22Img2ImgPipeline, - KandinskyV22InpaintCombinedPipeline, - KandinskyV22InpaintPipeline, - KandinskyV22Pipeline, - KandinskyV22PriorEmb2EmbPipeline, - KandinskyV22PriorPipeline, - LDMTextToImagePipeline, - MusicLDMPipeline, - PaintByExamplePipeline, - SemanticStableDiffusionPipeline, - ShapEImg2ImgPipeline, - ShapEPipeline, - StableDiffusionAdapterPipeline, - StableDiffusionAttendAndExcitePipeline, - StableDiffusionControlNetImg2ImgPipeline, - StableDiffusionControlNetInpaintPipeline, - StableDiffusionControlNetPipeline, - StableDiffusionDepth2ImgPipeline, - StableDiffusionDiffEditPipeline, - StableDiffusionGLIGENPipeline, - StableDiffusionGLIGENTextImagePipeline, - StableDiffusionImageVariationPipeline, - StableDiffusionImg2ImgPipeline, - StableDiffusionInpaintPipeline, - StableDiffusionInpaintPipelineLegacy, - StableDiffusionInstructPix2PixPipeline, - StableDiffusionLatentUpscalePipeline, - StableDiffusionLDM3DPipeline, - StableDiffusionModelEditingPipeline, - StableDiffusionPanoramaPipeline, - StableDiffusionParadigmsPipeline, - StableDiffusionPipeline, - StableDiffusionPipelineSafe, - StableDiffusionPix2PixZeroPipeline, - StableDiffusionSAGPipeline, - StableDiffusionUpscalePipeline, - StableDiffusionXLAdapterPipeline, - StableDiffusionXLControlNetImg2ImgPipeline, - StableDiffusionXLControlNetInpaintPipeline, - StableDiffusionXLControlNetPipeline, - StableDiffusionXLImg2ImgPipeline, - StableDiffusionXLInpaintPipeline, - StableDiffusionXLInstructPix2PixPipeline, - StableDiffusionXLPipeline, - StableUnCLIPImg2ImgPipeline, - StableUnCLIPPipeline, - TextToVideoSDPipeline, - TextToVideoZeroPipeline, - UnCLIPImageVariationPipeline, - UnCLIPPipeline, - UniDiffuserModel, - UniDiffuserPipeline, - UniDiffuserTextDecoder, - VersatileDiffusionDualGuidedPipeline, - VersatileDiffusionImageVariationPipeline, - VersatileDiffusionPipeline, - VersatileDiffusionTextToImagePipeline, - VideoToVideoSDPipeline, - VQDiffusionPipeline, - WuerstchenCombinedPipeline, - WuerstchenDecoderPipeline, - WuerstchenPriorPipeline, + _import_structure["pipelines"].extend( + [ + "AltDiffusionImg2ImgPipeline", + "AltDiffusionPipeline", + "AudioLDM2Pipeline", + "AudioLDM2ProjectionModel", + "AudioLDM2UNet2DConditionModel", + "AudioLDMPipeline", + "CycleDiffusionPipeline", + "IFImg2ImgPipeline", + "IFImg2ImgSuperResolutionPipeline", + "IFInpaintingPipeline", + "IFInpaintingSuperResolutionPipeline", + "IFPipeline", + "IFSuperResolutionPipeline", + "ImageTextPipelineOutput", + "KandinskyCombinedPipeline", + "KandinskyImg2ImgCombinedPipeline", + "KandinskyImg2ImgPipeline", + "KandinskyInpaintCombinedPipeline", + "KandinskyInpaintPipeline", + "KandinskyPipeline", + "KandinskyPriorPipeline", + "KandinskyV22CombinedPipeline", + "KandinskyV22ControlnetImg2ImgPipeline", + "KandinskyV22ControlnetPipeline", + "KandinskyV22Img2ImgCombinedPipeline", + "KandinskyV22Img2ImgPipeline", + "KandinskyV22InpaintCombinedPipeline", + "KandinskyV22InpaintPipeline", + "KandinskyV22Pipeline", + "KandinskyV22PriorEmb2EmbPipeline", + "KandinskyV22PriorPipeline", + "LDMTextToImagePipeline", + "MusicLDMPipeline", + "PaintByExamplePipeline", + "SemanticStableDiffusionPipeline", + "ShapEImg2ImgPipeline", + "ShapEPipeline", + "StableDiffusionAdapterPipeline", + "StableDiffusionAttendAndExcitePipeline", + 
"StableDiffusionControlNetImg2ImgPipeline", + "StableDiffusionControlNetInpaintPipeline", + "StableDiffusionControlNetPipeline", + "StableDiffusionDepth2ImgPipeline", + "StableDiffusionDiffEditPipeline", + "StableDiffusionGLIGENPipeline", + "StableDiffusionGLIGENPipeline", + "StableDiffusionGLIGENTextImagePipeline", + "StableDiffusionImageVariationPipeline", + "StableDiffusionImg2ImgPipeline", + "StableDiffusionInpaintPipeline", + "StableDiffusionInpaintPipelineLegacy", + "StableDiffusionInstructPix2PixPipeline", + "StableDiffusionLatentUpscalePipeline", + "StableDiffusionLDM3DPipeline", + "StableDiffusionModelEditingPipeline", + "StableDiffusionPanoramaPipeline", + "StableDiffusionParadigmsPipeline", + "StableDiffusionPipeline", + "StableDiffusionPipelineSafe", + "StableDiffusionPix2PixZeroPipeline", + "StableDiffusionSAGPipeline", + "StableDiffusionUpscalePipeline", + "StableDiffusionXLAdapterPipeline", + "StableDiffusionXLControlNetImg2ImgPipeline", + "StableDiffusionXLControlNetInpaintPipeline", + "StableDiffusionXLControlNetPipeline", + "StableDiffusionXLImg2ImgPipeline", + "StableDiffusionXLInpaintPipeline", + "StableDiffusionXLInstructPix2PixPipeline", + "StableDiffusionXLPipeline", + "StableUnCLIPImg2ImgPipeline", + "StableUnCLIPPipeline", + "TextToVideoSDPipeline", + "TextToVideoZeroPipeline", + "UnCLIPImageVariationPipeline", + "UnCLIPPipeline", + "UniDiffuserModel", + "UniDiffuserPipeline", + "UniDiffuserTextDecoder", + "VersatileDiffusionDualGuidedPipeline", + "VersatileDiffusionImageVariationPipeline", + "VersatileDiffusionPipeline", + "VersatileDiffusionTextToImagePipeline", + "VideoToVideoSDPipeline", + "VQDiffusionPipeline", + "WuerstchenCombinedPipeline", + "WuerstchenDecoderPipeline", + "WuerstchenPriorPipeline", + ] ) try: if not (is_torch_available() and is_transformers_available() and is_k_diffusion_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from .utils.dummy_torch_and_transformers_and_k_diffusion_objects import * # noqa F403 + from .utils import dummy_torch_and_transformers_and_k_diffusion_objects # noqa F403 + + _import_structure["utils.dummy_torch_and_transformers_and_k_diffusion_objects"] = [ + name for name in dir(dummy_torch_and_transformers_and_k_diffusion_objects) if not name.startswith("_") + ] + else: - from .pipelines import StableDiffusionKDiffusionPipeline + _import_structure["pipelines"].extend(["StableDiffusionKDiffusionPipeline"]) try: if not (is_torch_available() and is_transformers_available() and is_onnx_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from .utils.dummy_torch_and_transformers_and_onnx_objects import * # noqa F403 + from .utils import dummy_torch_and_transformers_and_onnx_objects # noqa F403 + + _import_structure["utils.dummy_torch_and_transformers_and_onnx_objects"] = [ + name for name in dir(dummy_torch_and_transformers_and_onnx_objects) if not name.startswith("_") + ] + else: - from .pipelines import ( - OnnxStableDiffusionImg2ImgPipeline, - OnnxStableDiffusionInpaintPipeline, - OnnxStableDiffusionInpaintPipelineLegacy, - OnnxStableDiffusionPipeline, - OnnxStableDiffusionUpscalePipeline, - StableDiffusionOnnxPipeline, + _import_structure["pipelines"].extend( + [ + "OnnxStableDiffusionImg2ImgPipeline", + "OnnxStableDiffusionInpaintPipeline", + "OnnxStableDiffusionInpaintPipelineLegacy", + "OnnxStableDiffusionPipeline", + "OnnxStableDiffusionUpscalePipeline", + "StableDiffusionOnnxPipeline", + ] ) try: if not (is_torch_available() and 
is_librosa_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from .utils.dummy_torch_and_librosa_objects import * # noqa F403 + from .utils import dummy_torch_and_librosa_objects # noqa F403 + + _import_structure["utils.dummy_torch_and_librosa_objects"] = [ + name for name in dir(dummy_torch_and_librosa_objects) if not name.startswith("_") + ] + else: - from .pipelines import AudioDiffusionPipeline, Mel + _import_structure["pipelines"].extend(["AudioDiffusionPipeline", "Mel"]) try: if not (is_transformers_available() and is_torch_available() and is_note_seq_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from .utils.dummy_transformers_and_torch_and_note_seq_objects import * # noqa F403 + from .utils import dummy_transformers_and_torch_and_note_seq_objects # noqa F403 + + _import_structure["utils.dummy_transformers_and_torch_and_note_seq_objects"] = [ + name for name in dir(dummy_transformers_and_torch_and_note_seq_objects) if not name.startswith("_") + ] + + else: - from .pipelines import SpectrogramDiffusionPipeline + _import_structure["pipelines"].extend(["SpectrogramDiffusionPipeline"]) try: if not is_flax_available(): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from .utils.dummy_flax_objects import * # noqa F403 + from .utils import dummy_flax_objects # noqa F403 + + _import_structure["utils.dummy_flax_objects"] = [ + name for name in dir(dummy_flax_objects) if not name.startswith("_") + ] + + else: - from .models.controlnet_flax import FlaxControlNetModel - from .models.modeling_flax_utils import FlaxModelMixin - from .models.unet_2d_condition_flax import FlaxUNet2DConditionModel - from .models.vae_flax import FlaxAutoencoderKL - from .pipelines import FlaxDiffusionPipeline - from .schedulers import ( - FlaxDDIMScheduler, - FlaxDDPMScheduler, - FlaxDPMSolverMultistepScheduler, - FlaxKarrasVeScheduler, - FlaxLMSDiscreteScheduler, - FlaxPNDMScheduler, - FlaxSchedulerMixin, - FlaxScoreSdeVeScheduler, + _import_structure["models.controlnet_flax"] = ["FlaxControlNetModel"] + _import_structure["models.modeling_flax_utils"] = ["FlaxModelMixin"] + _import_structure["models.unet_2d_condition_flax"] = ["FlaxUNet2DConditionModel"] + _import_structure["models.vae_flax"] = ["FlaxAutoencoderKL"] + _import_structure["pipelines"].extend(["FlaxDiffusionPipeline"]) + _import_structure["schedulers"].extend( + [ + "FlaxDDIMScheduler", + "FlaxDDPMScheduler", + "FlaxDPMSolverMultistepScheduler", + "FlaxKarrasVeScheduler", + "FlaxLMSDiscreteScheduler", + "FlaxPNDMScheduler", + "FlaxSchedulerMixin", + "FlaxScoreSdeVeScheduler", + ] ) @@ -288,19 +379,330 @@ if not (is_flax_available() and is_transformers_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from .utils.dummy_flax_and_transformers_objects import * # noqa F403 + from .utils import dummy_flax_and_transformers_objects # noqa F403 + + _import_structure["utils.dummy_flax_and_transformers_objects"] = [ + name for name in dir(dummy_flax_and_transformers_objects) if not name.startswith("_") + ] + + else: - from .pipelines import ( - FlaxStableDiffusionControlNetPipeline, - FlaxStableDiffusionImg2ImgPipeline, - FlaxStableDiffusionInpaintPipeline, - FlaxStableDiffusionPipeline, + _import_structure["pipelines"].extend( + [ + "FlaxStableDiffusionControlNetPipeline", + "FlaxStableDiffusionImg2ImgPipeline", + "FlaxStableDiffusionInpaintPipeline", + "FlaxStableDiffusionPipeline", + ] ) try: if 
not (is_note_seq_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from .utils.dummy_note_seq_objects import * # noqa F403 + from .utils import dummy_note_seq_objects # noqa F403 + + _import_structure["utils.dummy_note_seq_objects"] = [ + name for name in dir(dummy_note_seq_objects) if not name.startswith("_") + ] + + else: - from .pipelines import MidiProcessor + _import_structure["pipelines"].extend(["MidiProcessor"]) + +if TYPE_CHECKING: + from .configuration_utils import ConfigMixin + + try: + if not is_onnx_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_onnx_objects import * # noqa F403 + else: + from .pipelines import OnnxRuntimeModel + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_pt_objects import * # noqa F403 + else: + from .models import ( + AsymmetricAutoencoderKL, + AutoencoderKL, + AutoencoderTiny, + ControlNetModel, + ModelMixin, + MultiAdapter, + PriorTransformer, + T2IAdapter, + T5FilmDecoder, + Transformer2DModel, + UNet1DModel, + UNet2DConditionModel, + UNet2DModel, + UNet3DConditionModel, + VQModel, + ) + from .optimization import ( + get_constant_schedule, + get_constant_schedule_with_warmup, + get_cosine_schedule_with_warmup, + get_cosine_with_hard_restarts_schedule_with_warmup, + get_linear_schedule_with_warmup, + get_polynomial_decay_schedule_with_warmup, + get_scheduler, + ) + from .pipelines import ( + AudioPipelineOutput, + AutoPipelineForImage2Image, + AutoPipelineForInpainting, + AutoPipelineForText2Image, + CLIPImageProjection, + ConsistencyModelPipeline, + DanceDiffusionPipeline, + DDIMPipeline, + DDPMPipeline, + DiffusionPipeline, + DiTPipeline, + ImagePipelineOutput, + KarrasVePipeline, + LDMPipeline, + LDMSuperResolutionPipeline, + PNDMPipeline, + RePaintPipeline, + ScoreSdeVePipeline, + ) + from .schedulers import ( + CMStochasticIterativeScheduler, + DDIMInverseScheduler, + DDIMParallelScheduler, + DDIMScheduler, + DDPMParallelScheduler, + DDPMScheduler, + DDPMWuerstchenScheduler, + DEISMultistepScheduler, + DPMSolverMultistepInverseScheduler, + DPMSolverMultistepScheduler, + DPMSolverSinglestepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + IPNDMScheduler, + KarrasVeScheduler, + KDPM2AncestralDiscreteScheduler, + KDPM2DiscreteScheduler, + PNDMScheduler, + RePaintScheduler, + SchedulerMixin, + ScoreSdeVeScheduler, + UnCLIPScheduler, + UniPCMultistepScheduler, + VQDiffusionScheduler, + ) + from .training_utils import EMAModel + + try: + if not (is_torch_available() and is_scipy_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_torch_and_scipy_objects import * # noqa F403 + else: + from .schedulers import LMSDiscreteScheduler + + try: + if not (is_torch_available() and is_torchsde_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_torch_and_torchsde_objects import * # noqa F403 + else: + from .schedulers import DPMSolverSDEScheduler + + try: + if not (is_torch_available() and is_transformers_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_torch_and_transformers_objects import * # noqa F403 + else: + from .pipelines import ( + AltDiffusionImg2ImgPipeline, + AltDiffusionPipeline, + AudioLDM2Pipeline, + 
AudioLDM2ProjectionModel, + AudioLDM2UNet2DConditionModel, + AudioLDMPipeline, + CycleDiffusionPipeline, + IFImg2ImgPipeline, + IFImg2ImgSuperResolutionPipeline, + IFInpaintingPipeline, + IFInpaintingSuperResolutionPipeline, + IFPipeline, + IFSuperResolutionPipeline, + ImageTextPipelineOutput, + KandinskyCombinedPipeline, + KandinskyImg2ImgCombinedPipeline, + KandinskyImg2ImgPipeline, + KandinskyInpaintCombinedPipeline, + KandinskyInpaintPipeline, + KandinskyPipeline, + KandinskyPriorPipeline, + KandinskyV22CombinedPipeline, + KandinskyV22ControlnetImg2ImgPipeline, + KandinskyV22ControlnetPipeline, + KandinskyV22Img2ImgCombinedPipeline, + KandinskyV22Img2ImgPipeline, + KandinskyV22InpaintCombinedPipeline, + KandinskyV22InpaintPipeline, + KandinskyV22Pipeline, + KandinskyV22PriorEmb2EmbPipeline, + KandinskyV22PriorPipeline, + LDMTextToImagePipeline, + MusicLDMPipeline, + PaintByExamplePipeline, + SemanticStableDiffusionPipeline, + ShapEImg2ImgPipeline, + ShapEPipeline, + StableDiffusionAdapterPipeline, + StableDiffusionAttendAndExcitePipeline, + StableDiffusionControlNetImg2ImgPipeline, + StableDiffusionControlNetInpaintPipeline, + StableDiffusionControlNetPipeline, + StableDiffusionDepth2ImgPipeline, + StableDiffusionDiffEditPipeline, + StableDiffusionGLIGENPipeline, + StableDiffusionGLIGENTextImagePipeline, + StableDiffusionImageVariationPipeline, + StableDiffusionImg2ImgPipeline, + StableDiffusionInpaintPipeline, + StableDiffusionInpaintPipelineLegacy, + StableDiffusionInstructPix2PixPipeline, + StableDiffusionLatentUpscalePipeline, + StableDiffusionLDM3DPipeline, + StableDiffusionModelEditingPipeline, + StableDiffusionPanoramaPipeline, + StableDiffusionParadigmsPipeline, + StableDiffusionPipeline, + StableDiffusionPipelineSafe, + StableDiffusionPix2PixZeroPipeline, + StableDiffusionSAGPipeline, + StableDiffusionUpscalePipeline, + StableDiffusionXLAdapterPipeline, + StableDiffusionXLControlNetImg2ImgPipeline, + StableDiffusionXLControlNetInpaintPipeline, + StableDiffusionXLControlNetPipeline, + StableDiffusionXLImg2ImgPipeline, + StableDiffusionXLInpaintPipeline, + StableDiffusionXLInstructPix2PixPipeline, + StableDiffusionXLPipeline, + StableUnCLIPImg2ImgPipeline, + StableUnCLIPPipeline, + TextToVideoSDPipeline, + TextToVideoZeroPipeline, + UnCLIPImageVariationPipeline, + UnCLIPPipeline, + UniDiffuserModel, + UniDiffuserPipeline, + UniDiffuserTextDecoder, + VersatileDiffusionDualGuidedPipeline, + VersatileDiffusionImageVariationPipeline, + VersatileDiffusionPipeline, + VersatileDiffusionTextToImagePipeline, + VideoToVideoSDPipeline, + VQDiffusionPipeline, + WuerstchenCombinedPipeline, + WuerstchenDecoderPipeline, + WuerstchenPriorPipeline, + ) + + try: + if not (is_torch_available() and is_transformers_available() and is_k_diffusion_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_torch_and_transformers_and_k_diffusion_objects import * # noqa F403 + else: + from .pipelines import StableDiffusionKDiffusionPipeline + + try: + if not (is_torch_available() and is_transformers_available() and is_onnx_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_torch_and_transformers_and_onnx_objects import * # noqa F403 + else: + from .pipelines import ( + OnnxStableDiffusionImg2ImgPipeline, + OnnxStableDiffusionInpaintPipeline, + OnnxStableDiffusionInpaintPipelineLegacy, + OnnxStableDiffusionPipeline, + OnnxStableDiffusionUpscalePipeline, + StableDiffusionOnnxPipeline, + 
) + + try: + if not (is_torch_available() and is_librosa_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_torch_and_librosa_objects import * # noqa F403 + else: + from .pipelines import AudioDiffusionPipeline, Mel + + try: + if not (is_transformers_available() and is_torch_available() and is_note_seq_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_transformers_and_torch_and_note_seq_objects import * # noqa F403 + else: + from .pipelines import SpectrogramDiffusionPipeline + + try: + if not is_flax_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_flax_objects import * # noqa F403 + else: + from .models.controlnet_flax import FlaxControlNetModel + from .models.modeling_flax_utils import FlaxModelMixin + from .models.unet_2d_condition_flax import FlaxUNet2DConditionModel + from .models.vae_flax import FlaxAutoencoderKL + from .pipelines import FlaxDiffusionPipeline + from .schedulers import ( + FlaxDDIMScheduler, + FlaxDDPMScheduler, + FlaxDPMSolverMultistepScheduler, + FlaxKarrasVeScheduler, + FlaxLMSDiscreteScheduler, + FlaxPNDMScheduler, + FlaxSchedulerMixin, + FlaxScoreSdeVeScheduler, + ) + + try: + if not (is_flax_available() and is_transformers_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_flax_and_transformers_objects import * # noqa F403 + else: + from .pipelines import ( + FlaxStableDiffusionControlNetPipeline, + FlaxStableDiffusionImg2ImgPipeline, + FlaxStableDiffusionInpaintPipeline, + FlaxStableDiffusionPipeline, + ) + + try: + if not (is_note_seq_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_note_seq_objects import * # noqa F403 + else: + from .pipelines import MidiProcessor + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + extra_objects={"__version__": __version__}, + ) diff --git a/src/diffusers/experimental/rl/value_guided_sampling.py b/src/diffusers/experimental/rl/value_guided_sampling.py index e58952aa207f..262039be4fdb 100644 --- a/src/diffusers/experimental/rl/value_guided_sampling.py +++ b/src/diffusers/experimental/rl/value_guided_sampling.py @@ -18,8 +18,8 @@ from ...models.unet_1d import UNet1DModel from ...pipelines import DiffusionPipeline -from ...utils import randn_tensor from ...utils.dummy_pt_objects import DDPMScheduler +from ...utils.torch_utils import randn_tensor class ValueGuidedRLPipeline(DiffusionPipeline): diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py index 54e77df0ff72..fc60ff845ccf 100644 --- a/src/diffusers/models/__init__.py +++ b/src/diffusers/models/__init__.py @@ -12,27 +12,35 @@ # See the License for the specific language governing permissions and # limitations under the License. 
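The diff for `src/diffusers/models/__init__.py` continuing here applies the same conversion. Before it, it is worth spelling out why the top-level init above declares every public name twice: the string form in `_import_structure` drives the lazy runtime path, while the `if TYPE_CHECKING:` branch performs the real imports so that static type checkers and IDE completion still resolve the symbols. Condensed to its skeleton (with a hypothetical `CoolPipeline` standing in for the real names), the pattern is:

from typing import TYPE_CHECKING

_import_structure = {"pipelines": ["CoolPipeline"]}  # runtime: strings only, nothing imported yet

if TYPE_CHECKING:
    # Never executed at runtime; mypy and IDEs follow this branch and see the real symbols.
    from .pipelines import CoolPipeline
else:
    import sys

    from .utils import _LazyModule

    # Replace this module object with a lazy proxy; attribute access triggers the import.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

Note that `models/__init__.py` below skips the `TYPE_CHECKING` mirror and installs its `_LazyModule` unconditionally.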
-from ..utils import is_flax_available, is_torch_available +from ..utils import _LazyModule, is_flax_available, is_torch_available +_import_structure = {} + if is_torch_available(): - from .adapter import MultiAdapter, T2IAdapter - from .autoencoder_asym_kl import AsymmetricAutoencoderKL - from .autoencoder_kl import AutoencoderKL - from .autoencoder_tiny import AutoencoderTiny - from .controlnet import ControlNetModel - from .dual_transformer_2d import DualTransformer2DModel - from .modeling_utils import ModelMixin - from .prior_transformer import PriorTransformer - from .t5_film_transformer import T5FilmDecoder - from .transformer_2d import Transformer2DModel - from .unet_1d import UNet1DModel - from .unet_2d import UNet2DModel - from .unet_2d_condition import UNet2DConditionModel - from .unet_3d_condition import UNet3DConditionModel - from .vq_model import VQModel + _import_structure["adapter"] = ["MultiAdapter", "T2IAdapter"] + _import_structure["autoencoder_asym_kl"] = ["AsymmetricAutoencoderKL"] + _import_structure["autoencoder_kl"] = ["AutoencoderKL"] + _import_structure["autoencoder_tiny"] = ["AutoencoderTiny"] + _import_structure["controlnet"] = ["ControlNetModel"] + _import_structure["dual_transformer_2d"] = ["DualTransformer2DModel"] + _import_structure["modeling_utils"] = ["ModelMixin"] + _import_structure["prior_transformer"] = ["PriorTransformer"] + _import_structure["t5_film_transformer"] = ["T5FilmDecoder"] + _import_structure["transformer_2d"] = ["Transformer2DModel"] + _import_structure["transformer_temporal"] = ["TransformerTemporalModel"] + _import_structure["unet_1d"] = ["UNet1DModel"] + _import_structure["unet_2d"] = ["UNet2DModel"] + _import_structure["unet_2d_condition"] = ["UNet2DConditionModel"] + _import_structure["unet_3d_condition"] = ["UNet3DConditionModel"] + _import_structure["vq_model"] = ["VQModel"] if is_flax_available(): - from .controlnet_flax import FlaxControlNetModel - from .unet_2d_condition_flax import FlaxUNet2DConditionModel - from .vae_flax import FlaxAutoencoderKL + _import_structure["controlnet_flax"] = ["FlaxControlNetModel"] + _import_structure["unet_2d_condition_flax"] = ["FlaxUNet2DConditionModel"] + _import_structure["vae_flax"] = ["FlaxAutoencoderKL"] + +import sys + + +sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py index 185b87f2046a..892d44a03137 100644 --- a/src/diffusers/models/attention.py +++ b/src/diffusers/models/attention.py @@ -17,7 +17,7 @@ import torch.nn.functional as F from torch import nn -from ..utils import maybe_allow_in_graph +from ..utils.torch_utils import maybe_allow_in_graph from .activations import get_activation from .attention_processor import Attention from .embeddings import CombinedTimestepLabelEmbeddings diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index 49fc2c638620..c8e7dc66802c 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -18,8 +18,9 @@ import torch.nn.functional as F from torch import nn -from ..utils import deprecate, logging, maybe_allow_in_graph +from ..utils import deprecate, logging from ..utils.import_utils import is_xformers_available +from ..utils.torch_utils import maybe_allow_in_graph from .lora import LoRACompatibleLinear, LoRALinearLayer diff --git a/src/diffusers/models/autoencoder_asym_kl.py 
b/src/diffusers/models/autoencoder_asym_kl.py index e286cb215dbf..d8099120918b 100644 --- a/src/diffusers/models/autoencoder_asym_kl.py +++ b/src/diffusers/models/autoencoder_asym_kl.py @@ -17,7 +17,7 @@ import torch.nn as nn from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import apply_forward_hook +from ..utils.accelerate_utils import apply_forward_hook from .autoencoder_kl import AutoencoderKLOutput from .modeling_utils import ModelMixin from .vae import DecoderOutput, DiagonalGaussianDistribution, Encoder, MaskConditionDecoder diff --git a/src/diffusers/models/autoencoder_kl.py b/src/diffusers/models/autoencoder_kl.py index 72157e5827b4..76666a4cc295 100644 --- a/src/diffusers/models/autoencoder_kl.py +++ b/src/diffusers/models/autoencoder_kl.py @@ -19,7 +19,8 @@ from ..configuration_utils import ConfigMixin, register_to_config from ..loaders import FromOriginalVAEMixin -from ..utils import BaseOutput, apply_forward_hook +from ..utils import BaseOutput +from ..utils.accelerate_utils import apply_forward_hook from .attention_processor import ( ADDED_KV_ATTENTION_PROCESSORS, CROSS_ATTENTION_PROCESSORS, diff --git a/src/diffusers/models/autoencoder_tiny.py b/src/diffusers/models/autoencoder_tiny.py index ad36b7a2ce66..407b1906bba4 100644 --- a/src/diffusers/models/autoencoder_tiny.py +++ b/src/diffusers/models/autoencoder_tiny.py @@ -19,7 +19,8 @@ import torch from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput, apply_forward_hook +from ..utils import BaseOutput +from ..utils.accelerate_utils import apply_forward_hook from .modeling_utils import ModelMixin from .vae import DecoderOutput, DecoderTiny, EncoderTiny diff --git a/src/diffusers/models/vae.py b/src/diffusers/models/vae.py index 220c0ce990c8..36983eefc01f 100644 --- a/src/diffusers/models/vae.py +++ b/src/diffusers/models/vae.py @@ -18,7 +18,8 @@ import torch import torch.nn as nn -from ..utils import BaseOutput, is_torch_version, randn_tensor +from ..utils import BaseOutput, is_torch_version +from ..utils.torch_utils import randn_tensor from .activations import get_activation from .attention_processor import SpatialNorm from .unet_2d_blocks import AutoencoderTinyBlock, UNetMidBlock2D, get_down_block, get_up_block diff --git a/src/diffusers/models/vq_model.py b/src/diffusers/models/vq_model.py index 393a638d483b..0c15300af213 100644 --- a/src/diffusers/models/vq_model.py +++ b/src/diffusers/models/vq_model.py @@ -18,7 +18,8 @@ import torch.nn as nn from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput, apply_forward_hook +from ..utils import BaseOutput +from ..utils.accelerate_utils import apply_forward_hook from .modeling_utils import ModelMixin from .vae import Decoder, DecoderOutput, Encoder, VectorQuantizer diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 28f42ce9fae9..b237adae7d54 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -1,5 +1,7 @@ from ..utils import ( OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, is_flax_available, is_k_diffusion_available, is_librosa_available, @@ -10,187 +12,256 @@ ) +# These modules contain pipelines from multiple libraries/frameworks +_import_structure = {"stable_diffusion": [], "latent_diffusion": [], "controlnet": []} +_dummy_objects = {} + try: if not is_torch_available(): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from 
..utils.dummy_pt_objects import * # noqa F403 + from ..utils import dummy_pt_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_pt_objects)) + else: - from .auto_pipeline import AutoPipelineForImage2Image, AutoPipelineForInpainting, AutoPipelineForText2Image - from .consistency_models import ConsistencyModelPipeline - from .dance_diffusion import DanceDiffusionPipeline - from .ddim import DDIMPipeline - from .ddpm import DDPMPipeline - from .dit import DiTPipeline - from .latent_diffusion import LDMSuperResolutionPipeline - from .latent_diffusion_uncond import LDMPipeline - from .pipeline_utils import AudioPipelineOutput, DiffusionPipeline, ImagePipelineOutput - from .pndm import PNDMPipeline - from .repaint import RePaintPipeline - from .score_sde_ve import ScoreSdeVePipeline - from .stochastic_karras_ve import KarrasVePipeline + _import_structure["auto_pipeline"] = [ + "AutoPipelineForImage2Image", + "AutoPipelineForInpainting", + "AutoPipelineForText2Image", + ] + _import_structure["consistency_models"] = ["ConsistencyModelPipeline"] + _import_structure["dance_diffusion"] = ["DanceDiffusionPipeline"] + _import_structure["ddim"] = ["DDIMPipeline"] + _import_structure["ddpm"] = ["DDPMPipeline"] + _import_structure["dit"] = ["DiTPipeline"] + _import_structure["latent_diffusion"].extend(["LDMSuperResolutionPipeline"]) + _import_structure["latent_diffusion_uncond"] = ["LDMPipeline"] + _import_structure["pipeline_utils"] = ["AudioPipelineOutput", "DiffusionPipeline", "ImagePipelineOutput"] + _import_structure["pndm"] = ["PNDMPipeline"] + _import_structure["repaint"] = ["RePaintPipeline"] + _import_structure["score_sde_ve"] = ["ScoreSdeVePipeline"] + _import_structure["stochastic_karras_ve"] = ["KarrasVePipeline"] try: if not (is_torch_available() and is_librosa_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ..utils.dummy_torch_and_librosa_objects import * # noqa F403 + from ..utils import dummy_torch_and_librosa_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_librosa_objects)) + else: - from .audio_diffusion import AudioDiffusionPipeline, Mel + _import_structure["audio_diffusion"] = ["AudioDiffusionPipeline", "Mel"] try: if not (is_torch_available() and is_transformers_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ..utils.dummy_torch_and_transformers_objects import * # noqa F403 + from ..utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) + else: - from .alt_diffusion import AltDiffusionImg2ImgPipeline, AltDiffusionPipeline - from .audioldm import AudioLDMPipeline - from .audioldm2 import AudioLDM2Pipeline, AudioLDM2ProjectionModel, AudioLDM2UNet2DConditionModel - from .controlnet import ( - StableDiffusionControlNetImg2ImgPipeline, - StableDiffusionControlNetInpaintPipeline, - StableDiffusionControlNetPipeline, - StableDiffusionXLControlNetImg2ImgPipeline, - StableDiffusionXLControlNetInpaintPipeline, - StableDiffusionXLControlNetPipeline, - ) - from .deepfloyd_if import ( - IFImg2ImgPipeline, - IFImg2ImgSuperResolutionPipeline, - IFInpaintingPipeline, - IFInpaintingSuperResolutionPipeline, - IFPipeline, - IFSuperResolutionPipeline, + _import_structure["alt_diffusion"] = ["AltDiffusionImg2ImgPipeline", "AltDiffusionPipeline"] + _import_structure["audioldm"] = ["AudioLDMPipeline"] + _import_structure["audioldm2"] = 
["AudioLDM2Pipeline", "AudioLDM2ProjectionModel", "AudioLDM2UNet2DConditionModel"] + _import_structure["controlnet"].extend( + [ + "StableDiffusionControlNetImg2ImgPipeline", + "StableDiffusionControlNetInpaintPipeline", + "StableDiffusionControlNetPipeline", + "StableDiffusionXLControlNetImg2ImgPipeline", + "StableDiffusionXLControlNetInpaintPipeline", + "StableDiffusionXLControlNetPipeline", + ] ) - from .kandinsky import ( - KandinskyCombinedPipeline, - KandinskyImg2ImgCombinedPipeline, - KandinskyImg2ImgPipeline, - KandinskyInpaintCombinedPipeline, - KandinskyInpaintPipeline, - KandinskyPipeline, - KandinskyPriorPipeline, + _import_structure["deepfloyd_if"] = [ + "IFImg2ImgPipeline", + "IFImg2ImgSuperResolutionPipeline", + "IFInpaintingPipeline", + "IFInpaintingSuperResolutionPipeline", + "IFPipeline", + "IFSuperResolutionPipeline", + ] + _import_structure["kandinsky"] = [ + "KandinskyCombinedPipeline", + "KandinskyImg2ImgCombinedPipeline", + "KandinskyImg2ImgPipeline", + "KandinskyInpaintCombinedPipeline", + "KandinskyInpaintPipeline", + "KandinskyPipeline", + "KandinskyPriorPipeline", + ] + _import_structure["kandinsky2_2"] = [ + "KandinskyV22CombinedPipeline", + "KandinskyV22ControlnetImg2ImgPipeline", + "KandinskyV22ControlnetPipeline", + "KandinskyV22Img2ImgCombinedPipeline", + "KandinskyV22Img2ImgPipeline", + "KandinskyV22InpaintCombinedPipeline", + "KandinskyV22InpaintPipeline", + "KandinskyV22Pipeline", + "KandinskyV22PriorEmb2EmbPipeline", + "KandinskyV22PriorPipeline", + ] + _import_structure["latent_diffusion"].extend(["LDMTextToImagePipeline"]) + _import_structure["musicldm"] = ["MusicLDMPipeline"] + _import_structure["paint_by_example"] = ["PaintByExamplePipeline"] + _import_structure["semantic_stable_diffusion"] = ["SemanticStableDiffusionPipeline"] + _import_structure["shap_e"] = ["ShapEImg2ImgPipeline", "ShapEPipeline"] + _import_structure["stable_diffusion"].extend( + [ + "CycleDiffusionPipeline", + "StableDiffusionAttendAndExcitePipeline", + "StableDiffusionDepth2ImgPipeline", + "StableDiffusionDiffEditPipeline", + "StableDiffusionGLIGENPipeline", + "StableDiffusionImageVariationPipeline", + "StableDiffusionImg2ImgPipeline", + "StableDiffusionInpaintPipeline", + "StableDiffusionInpaintPipelineLegacy", + "StableDiffusionInstructPix2PixPipeline", + "StableDiffusionLatentUpscalePipeline", + "StableDiffusionLDM3DPipeline", + "StableDiffusionModelEditingPipeline", + "StableDiffusionPanoramaPipeline", + "StableDiffusionParadigmsPipeline", + "StableDiffusionPipeline", + "StableDiffusionPix2PixZeroPipeline", + "StableDiffusionSAGPipeline", + "StableDiffusionUpscalePipeline", + "StableUnCLIPImg2ImgPipeline", + "StableUnCLIPPipeline", + "StableDiffusionGLIGENTextImagePipeline", + "StableDiffusionGLIGENPipeline", + ] ) - from .kandinsky2_2 import ( - KandinskyV22CombinedPipeline, - KandinskyV22ControlnetImg2ImgPipeline, - KandinskyV22ControlnetPipeline, - KandinskyV22Img2ImgCombinedPipeline, - KandinskyV22Img2ImgPipeline, - KandinskyV22InpaintCombinedPipeline, - KandinskyV22InpaintPipeline, - KandinskyV22Pipeline, - KandinskyV22PriorEmb2EmbPipeline, - KandinskyV22PriorPipeline, - ) - from .latent_diffusion import LDMTextToImagePipeline - from .musicldm import MusicLDMPipeline - from .paint_by_example import PaintByExamplePipeline - from .semantic_stable_diffusion import SemanticStableDiffusionPipeline - from .shap_e import ShapEImg2ImgPipeline, ShapEPipeline - from .stable_diffusion import ( - CycleDiffusionPipeline, - StableDiffusionAttendAndExcitePipeline, - 
StableDiffusionDepth2ImgPipeline, - StableDiffusionDiffEditPipeline, - StableDiffusionGLIGENPipeline, - StableDiffusionGLIGENTextImagePipeline, - StableDiffusionImageVariationPipeline, - StableDiffusionImg2ImgPipeline, - StableDiffusionInpaintPipeline, - StableDiffusionInpaintPipelineLegacy, - StableDiffusionInstructPix2PixPipeline, - StableDiffusionLatentUpscalePipeline, - StableDiffusionLDM3DPipeline, - StableDiffusionModelEditingPipeline, - StableDiffusionPanoramaPipeline, - StableDiffusionParadigmsPipeline, - StableDiffusionPipeline, - StableDiffusionPix2PixZeroPipeline, - StableDiffusionSAGPipeline, - StableDiffusionUpscalePipeline, - StableUnCLIPImg2ImgPipeline, - StableUnCLIPPipeline, - ) - from .stable_diffusion.clip_image_project_model import CLIPImageProjection - from .stable_diffusion_safe import StableDiffusionPipelineSafe - from .stable_diffusion_xl import ( - StableDiffusionXLImg2ImgPipeline, - StableDiffusionXLInpaintPipeline, - StableDiffusionXLInstructPix2PixPipeline, - StableDiffusionXLPipeline, - ) - from .t2i_adapter import StableDiffusionAdapterPipeline, StableDiffusionXLAdapterPipeline - from .text_to_video_synthesis import TextToVideoSDPipeline, TextToVideoZeroPipeline, VideoToVideoSDPipeline - from .unclip import UnCLIPImageVariationPipeline, UnCLIPPipeline - from .unidiffuser import ImageTextPipelineOutput, UniDiffuserModel, UniDiffuserPipeline, UniDiffuserTextDecoder - from .versatile_diffusion import ( - VersatileDiffusionDualGuidedPipeline, - VersatileDiffusionImageVariationPipeline, - VersatileDiffusionPipeline, - VersatileDiffusionTextToImagePipeline, - ) - from .vq_diffusion import VQDiffusionPipeline - from .wuerstchen import WuerstchenCombinedPipeline, WuerstchenDecoderPipeline, WuerstchenPriorPipeline + _import_structure["stable_diffusion_safe"] = ["StableDiffusionPipelineSafe"] + _import_structure["stable_diffusion_xl"] = [ + "StableDiffusionXLImg2ImgPipeline", + "StableDiffusionXLInpaintPipeline", + "StableDiffusionXLInstructPix2PixPipeline", + "StableDiffusionXLPipeline", + ] + _import_structure["t2i_adapter"] = ["StableDiffusionAdapterPipeline", "StableDiffusionXLAdapterPipeline"] + _import_structure["text_to_video_synthesis"] = [ + "TextToVideoSDPipeline", + "TextToVideoZeroPipeline", + "VideoToVideoSDPipeline", + ] + _import_structure["unclip"] = ["UnCLIPImageVariationPipeline", "UnCLIPPipeline"] + _import_structure["unidiffuser"] = [ + "ImageTextPipelineOutput", + "UniDiffuserModel", + "UniDiffuserPipeline", + "UniDiffuserTextDecoder", + ] + _import_structure["versatile_diffusion"] = [ + "VersatileDiffusionDualGuidedPipeline", + "VersatileDiffusionImageVariationPipeline", + "VersatileDiffusionPipeline", + "VersatileDiffusionTextToImagePipeline", + ] + _import_structure["vq_diffusion"] = ["VQDiffusionPipeline"] + _import_structure["wuerstchen"] = [ + "WuerstchenCombinedPipeline", + "WuerstchenDecoderPipeline", + "WuerstchenPriorPipeline", + ] try: if not is_onnx_available(): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ..utils.dummy_onnx_objects import * # noqa F403 + from ..utils import dummy_onnx_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_onnx_objects)) + else: - from .onnx_utils import OnnxRuntimeModel + _import_structure["onnx_utils"] = ["OnnxRuntimeModel"] try: if not (is_torch_available() and is_transformers_available() and is_onnx_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from 
..utils.dummy_torch_and_transformers_and_onnx_objects import * # noqa F403 + from ..utils import dummy_torch_and_transformers_and_onnx_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_and_onnx_objects)) + else: - from .stable_diffusion import ( - OnnxStableDiffusionImg2ImgPipeline, - OnnxStableDiffusionInpaintPipeline, - OnnxStableDiffusionInpaintPipelineLegacy, - OnnxStableDiffusionPipeline, - OnnxStableDiffusionUpscalePipeline, - StableDiffusionOnnxPipeline, + _import_structure["stable_diffusion"].extend( + [ + "OnnxStableDiffusionImg2ImgPipeline", + "OnnxStableDiffusionInpaintPipeline", + "OnnxStableDiffusionInpaintPipelineLegacy", + "OnnxStableDiffusionPipeline", + "OnnxStableDiffusionUpscalePipeline", + "StableDiffusionOnnxPipeline", + ] ) try: if not (is_torch_available() and is_transformers_available() and is_k_diffusion_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ..utils.dummy_torch_and_transformers_and_k_diffusion_objects import * # noqa F403 + from ..utils import dummy_torch_and_transformers_and_k_diffusion_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_and_k_diffusion_objects)) + else: - from .stable_diffusion import StableDiffusionKDiffusionPipeline + _import_structure["stable_diffusion"].extend(["StableDiffusionKDiffusionPipeline"]) try: if not is_flax_available(): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ..utils.dummy_flax_objects import * # noqa F403 + from ..utils import dummy_flax_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_flax_objects)) + else: - from .pipeline_flax_utils import FlaxDiffusionPipeline + _import_structure["pipeline_flax_utils"] = ["FlaxDiffusionPipeline"] try: if not (is_flax_available() and is_transformers_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ..utils.dummy_flax_and_transformers_objects import * # noqa F403 + from ..utils import dummy_flax_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_flax_and_transformers_objects)) + else: - from .controlnet import FlaxStableDiffusionControlNetPipeline - from .stable_diffusion import ( - FlaxStableDiffusionImg2ImgPipeline, - FlaxStableDiffusionInpaintPipeline, - FlaxStableDiffusionPipeline, + _import_structure["controlnet"].extend(["FlaxStableDiffusionControlNetPipeline"]) + _import_structure["stable_diffusion"].extend( + [ + "FlaxStableDiffusionImg2ImgPipeline", + "FlaxStableDiffusionInpaintPipeline", + "FlaxStableDiffusionPipeline", + ] ) try: if not (is_transformers_available() and is_torch_available() and is_note_seq_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ..utils.dummy_transformers_and_torch_and_note_seq_objects import * # noqa F403 + from ..utils import dummy_transformers_and_torch_and_note_seq_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_transformers_and_torch_and_note_seq_objects)) + else: - from .spectrogram_diffusion import MidiProcessor, SpectrogramDiffusionPipeline + _import_structure["spectrogram_diffusion"] = ["MidiProcessor", "SpectrogramDiffusionPipeline"] + + +import sys + + +sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, +) +for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, 
value) diff --git a/src/diffusers/pipelines/alt_diffusion/__init__.py b/src/diffusers/pipelines/alt_diffusion/__init__.py index 03c9f5ebc63e..c2e4db7eab1c 100644 --- a/src/diffusers/pipelines/alt_diffusion/__init__.py +++ b/src/diffusers/pipelines/alt_diffusion/__init__.py @@ -1,38 +1,37 @@ -from dataclasses import dataclass -from typing import List, Optional, Union +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) -import numpy as np -import PIL -from PIL import Image - -from ...utils import BaseOutput, OptionalDependencyNotAvailable, is_torch_available, is_transformers_available - - -@dataclass -# Copied from diffusers.pipelines.stable_diffusion.__init__.StableDiffusionPipelineOutput with Stable->Alt -class AltDiffusionPipelineOutput(BaseOutput): - """ - Output class for Alt Diffusion pipelines. - - Args: - images (`List[PIL.Image.Image]` or `np.ndarray`) - List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width, - num_channels)`. - nsfw_content_detected (`List[bool]`) - List indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content or - `None` if safety checking could not be performed. - """ - - images: Union[List[PIL.Image.Image], np.ndarray] - nsfw_content_detected: Optional[List[bool]] +_import_structure = {} +_dummy_objects = {} try: if not (is_transformers_available() and is_torch_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ...utils.dummy_torch_and_transformers_objects import ShapEPipeline + from ...utils import dummy_torch_and_transformers_objects + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) + else: - from .modeling_roberta_series import RobertaSeriesModelWithTransformation - from .pipeline_alt_diffusion import AltDiffusionPipeline - from .pipeline_alt_diffusion_img2img import AltDiffusionImg2ImgPipeline + _import_structure["pipeline_output"] = ["AltDiffusionPipelineOutput"] + _import_structure["modeling_roberta_series"] = ["RobertaSeriesModelWithTransformation"] + _import_structure["pipeline_alt_diffusion"] = ["AltDiffusionPipeline"] + _import_structure["pipeline_alt_diffusion_img2img"] = ["AltDiffusionImg2ImgPipeline"] + +import sys + + +sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, +) +for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py index 78e46990b50c..7af8027ed763 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py @@ -27,7 +27,8 @@ from ...models import AutoencoderKL, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers -from ...utils import deprecate, logging, randn_tensor, replace_example_docstring +from ...utils import deprecate, logging, replace_example_docstring +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker from . 
import AltDiffusionPipelineOutput, RobertaSeriesModelWithTransformation diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index 5713395639cc..a7219446d273 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -29,7 +29,8 @@ from ...models import AutoencoderKL, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers -from ...utils import PIL_INTERPOLATION, deprecate, logging, randn_tensor, replace_example_docstring +from ...utils import PIL_INTERPOLATION, deprecate, logging, replace_example_docstring +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker from . import AltDiffusionPipelineOutput, RobertaSeriesModelWithTransformation diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_output.py b/src/diffusers/pipelines/alt_diffusion/pipeline_output.py new file mode 100644 index 000000000000..220c7f358402 --- /dev/null +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_output.py @@ -0,0 +1,28 @@ +from dataclasses import dataclass +from typing import List, Optional, Union + +import numpy as np +import PIL + +from ...utils import ( + BaseOutput, +) + + +@dataclass +# Copied from diffusers.pipelines.stable_diffusion.pipeline_output.StableDiffusionPipelineOutput with Stable->Alt +class AltDiffusionPipelineOutput(BaseOutput): + """ + Output class for Alt Diffusion pipelines. + + Args: + images (`List[PIL.Image.Image]` or `np.ndarray`) + List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width, + num_channels)`. + nsfw_content_detected (`List[bool]`) + List indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content or + `None` if safety checking could not be performed. 
+ """ + + images: Union[List[PIL.Image.Image], np.ndarray] + nsfw_content_detected: Optional[List[bool]] diff --git a/src/diffusers/pipelines/audio_diffusion/__init__.py b/src/diffusers/pipelines/audio_diffusion/__init__.py index 58554c45ea52..578a94693382 100644 --- a/src/diffusers/pipelines/audio_diffusion/__init__.py +++ b/src/diffusers/pipelines/audio_diffusion/__init__.py @@ -1,2 +1,18 @@ -from .mel import Mel -from .pipeline_audio_diffusion import AudioDiffusionPipeline +from ...utils import _LazyModule + + +_import_structure = {} +_dummy_objects = {} + +_import_structure["mel"] = ["Mel"] +_import_structure["pipeline_audio_diffusion"] = ["AudioDiffusionPipeline"] + +import sys + + +sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, +) diff --git a/src/diffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py b/src/diffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py index 74737560cd8e..a06217c19bf7 100644 --- a/src/diffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py +++ b/src/diffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py @@ -22,7 +22,7 @@ from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import DDIMScheduler, DDPMScheduler -from ...utils import randn_tensor +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import AudioPipelineOutput, BaseOutput, DiffusionPipeline, ImagePipelineOutput from .mel import Mel diff --git a/src/diffusers/pipelines/audioldm/__init__.py b/src/diffusers/pipelines/audioldm/__init__.py index 8ddef6c3f325..2acd5c25ed75 100644 --- a/src/diffusers/pipelines/audioldm/__init__.py +++ b/src/diffusers/pipelines/audioldm/__init__.py @@ -1,11 +1,16 @@ from ...utils import ( OptionalDependencyNotAvailable, + _LazyModule, is_torch_available, is_transformers_available, is_transformers_version, ) +_import_structure = {} +_dummy_objects = {} + + try: if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")): raise OptionalDependencyNotAvailable() @@ -13,5 +18,21 @@ from ...utils.dummy_torch_and_transformers_objects import ( AudioLDMPipeline, ) + + _dummy_objects.update({"AudioLDMPipeline": AudioLDMPipeline}) + else: - from .pipeline_audioldm import AudioLDMPipeline + _import_structure["pipeline_audioldm"] = ["AudioLDMPipeline"] + +import sys + + +sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, +) + +for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py index f577f51dd5ab..c95e45000133 100644 --- a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py +++ b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py @@ -22,7 +22,8 @@ from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers -from ...utils import logging, randn_tensor, replace_example_docstring +from ...utils import logging, replace_example_docstring +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline diff --git a/src/diffusers/pipelines/audioldm2/__init__.py b/src/diffusers/pipelines/audioldm2/__init__.py index 3917a6eb2116..67001f8e44ca 100644 --- a/src/diffusers/pipelines/audioldm2/__init__.py +++ b/src/diffusers/pipelines/audioldm2/__init__.py @@ -1,20 +1,34 @@ from 
...utils import ( OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, is_torch_available, is_transformers_available, is_transformers_version, ) +_import_structure = {} +_dummy_objects = {} + + try: if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ...utils.dummy_torch_and_transformers_objects import ( - AudioLDM2Pipeline, - AudioLDM2ProjectionModel, - AudioLDM2UNet2DConditionModel, - ) + from ...utils import dummy_torch_and_transformers_objects + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) + else: - from .modeling_audioldm2 import AudioLDM2ProjectionModel, AudioLDM2UNet2DConditionModel - from .pipeline_audioldm2 import AudioLDM2Pipeline + _import_structure["modeling_audioldm2"] = ["AudioLDM2ProjectionModel", "AudioLDM2UNet2DConditionModel"] + _import_structure["pipeline_audioldm2"] = ["AudioLDM2Pipeline"] + + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) diff --git a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py index 224b2a731b38..e5e03036caec 100644 --- a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py +++ b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py @@ -36,9 +36,9 @@ is_accelerate_version, is_librosa_available, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline from .modeling_audioldm2 import AudioLDM2ProjectionModel, AudioLDM2UNet2DConditionModel diff --git a/src/diffusers/pipelines/consistency_models/__init__.py b/src/diffusers/pipelines/consistency_models/__init__.py index fd78ddb3aae2..d1d2ab59500b 100644 --- a/src/diffusers/pipelines/consistency_models/__init__.py +++ b/src/diffusers/pipelines/consistency_models/__init__.py @@ -1 +1,17 @@ -from .pipeline_consistency_models import ConsistencyModelPipeline +from ...utils import ( + _LazyModule, +) + + +_import_structure = {} +_import_structure["pipeline_consistency_models"] = ["ConsistencyModelPipeline"] + +import sys + + +sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, +) diff --git a/src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py b/src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py index 83cb37dc1e35..511c767aeaf4 100644 --- a/src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py +++ b/src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py @@ -8,9 +8,9 @@ is_accelerate_available, is_accelerate_version, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput diff --git a/src/diffusers/pipelines/controlnet/__init__.py b/src/diffusers/pipelines/controlnet/__init__.py index 0cd7b69fe618..60b3fa0b7539 100644 --- a/src/diffusers/pipelines/controlnet/__init__.py +++ b/src/diffusers/pipelines/controlnet/__init__.py @@ -1,25 +1,57 @@ from ...utils import ( OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, is_flax_available, is_torch_available, is_transformers_available, ) +_import_structure = {} +_dummy_objects = {} + + try: if not 
(is_transformers_available() and is_torch_available()): raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: - from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) + + else: - from .multicontrolnet import MultiControlNetModel - from .pipeline_controlnet import StableDiffusionControlNetPipeline - from .pipeline_controlnet_img2img import StableDiffusionControlNetImg2ImgPipeline - from .pipeline_controlnet_inpaint import StableDiffusionControlNetInpaintPipeline - from .pipeline_controlnet_inpaint_sd_xl import StableDiffusionXLControlNetInpaintPipeline - from .pipeline_controlnet_sd_xl import StableDiffusionXLControlNetPipeline - from .pipeline_controlnet_sd_xl_img2img import StableDiffusionXLControlNetImg2ImgPipeline + _import_structure["multicontrolnet"] = ["MultiControlNetModel"] + _import_structure["pipeline_controlnet"] = ["StableDiffusionControlNetPipeline"] + _import_structure["pipeline_controlnet_img2img"] = ["StableDiffusionControlNetImg2ImgPipeline"] + _import_structure["pipeline_controlnet_inpaint"] = ["StableDiffusionControlNetInpaintPipeline"] + _import_structure["pipeline_controlnet_sd_xl"] = ["StableDiffusionXLControlNetPipeline"] + _import_structure["pipeline_controlnet_sd_xl_img2img"] = ["StableDiffusionXLControlNetImg2ImgPipeline"] + _import_structure["pipeline_controlnet_inpaint_sd_xl"] = ["StableDiffusionXLControlNetInpaintPipeline"] + +try: + if not (is_transformers_available() and is_flax_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_flax_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_flax_and_transformers_objects)) + +else: + _import_structure["pipeline_flax_controlnet"] = ["FlaxStableDiffusionControlNetPipeline"] + +import sys + + +sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, +) -if is_transformers_available() and is_flax_available(): - from .pipeline_flax_controlnet import FlaxStableDiffusionControlNetPipeline +for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py index 82e3851377d9..bb569249e5f5 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py @@ -31,11 +31,10 @@ deprecate, is_accelerate_available, is_accelerate_version, - is_compiled_module, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import is_compiled_module, randn_tensor from ..pipeline_utils import DiffusionPipeline from ..stable_diffusion import StableDiffusionPipelineOutput from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py index 88410ad0d7c3..7a173d98d279 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py @@ -30,11 +30,10 @@ deprecate, is_accelerate_available, is_accelerate_version, - is_compiled_module, logging, - randn_tensor, replace_example_docstring, ) +from 
...utils.torch_utils import is_compiled_module, randn_tensor from ..pipeline_utils import DiffusionPipeline from ..stable_diffusion import StableDiffusionPipelineOutput from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index f98e4bb20c3c..c933bf9ccee5 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -32,11 +32,10 @@ deprecate, is_accelerate_available, is_accelerate_version, - is_compiled_module, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import is_compiled_module, randn_tensor from ..pipeline_utils import DiffusionPipeline from ..stable_diffusion import StableDiffusionPipelineOutput from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py index c64204501b97..9d0dd462ba7e 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py @@ -38,12 +38,11 @@ from ...utils import ( is_accelerate_available, is_accelerate_version, - is_compiled_module, is_invisible_watermark_available, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import is_compiled_module, randn_tensor from ..pipeline_utils import DiffusionPipeline from .multicontrolnet import MultiControlNetModel diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py index ef6b54e81548..50e13b76d664 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py @@ -39,11 +39,10 @@ from ...utils import ( is_accelerate_available, is_accelerate_version, - is_compiled_module, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import is_compiled_module, randn_tensor from ..pipeline_utils import DiffusionPipeline from ..stable_diffusion_xl import StableDiffusionXLPipelineOutput diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py index 02f3d8e4b36d..ca3bc8ca7754 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py @@ -38,11 +38,10 @@ from ...utils import ( is_accelerate_available, is_accelerate_version, - is_compiled_module, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import is_compiled_module, randn_tensor from ..pipeline_utils import DiffusionPipeline from ..stable_diffusion_xl import StableDiffusionXLPipelineOutput diff --git a/src/diffusers/pipelines/dance_diffusion/__init__.py b/src/diffusers/pipelines/dance_diffusion/__init__.py index 55d7f8ff9807..39f213b35a04 100644 --- a/src/diffusers/pipelines/dance_diffusion/__init__.py +++ b/src/diffusers/pipelines/dance_diffusion/__init__.py @@ -1 +1,16 @@ -from .pipeline_dance_diffusion import DanceDiffusionPipeline +from ...utils import _LazyModule + + +_import_structure = {} +_import_structure["pipeline_dance_diffusion"] = ["DanceDiffusionPipeline"] + + +import sys + + +sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, +)
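The dance_diffusion hunk above completes the lazy-import conversion that this series applies to every pipeline subpackage: the `__init__.py` now only records an `_import_structure` mapping and then swaps itself for a `_LazyModule` in `sys.modules`, so the heavy pipeline modules are imported on first attribute access rather than at package import time. A minimal sketch of the resulting behavior, using the subpackage from the hunk above (illustrative only, not part of the patch):

    # Importing the subpackage runs only the lightweight __init__;
    # pipeline_dance_diffusion.py is not imported yet.
    import diffusers.pipelines.dance_diffusion as dance_diffusion

    # _LazyModule.__getattr__ imports the submodule on first access
    # and caches the attribute for subsequent lookups.
    pipe_cls = dance_diffusion.DanceDiffusionPipeline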
["DanceDiffusionPipeline"] + + +import sys + + +sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, +) diff --git a/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py b/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py index b2d46c6f90f1..77c57a1425d3 100644 --- a/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +++ b/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py @@ -17,7 +17,8 @@ import torch -from ...utils import logging, randn_tensor +from ...utils import logging +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline diff --git a/src/diffusers/pipelines/ddim/__init__.py b/src/diffusers/pipelines/ddim/__init__.py index 85e8118e75e7..1715a2b6acbb 100644 --- a/src/diffusers/pipelines/ddim/__init__.py +++ b/src/diffusers/pipelines/ddim/__init__.py @@ -1 +1,15 @@ -from .pipeline_ddim import DDIMPipeline +from ...utils import _LazyModule + + +_import_structure = {} +_import_structure["pipeline_ddim"] = ["DDIMPipeline"] + +import sys + + +sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, +) diff --git a/src/diffusers/pipelines/ddim/pipeline_ddim.py b/src/diffusers/pipelines/ddim/pipeline_ddim.py index 6eae78f2801e..dcb326ede058 100644 --- a/src/diffusers/pipelines/ddim/pipeline_ddim.py +++ b/src/diffusers/pipelines/ddim/pipeline_ddim.py @@ -17,7 +17,7 @@ import torch from ...schedulers import DDIMScheduler -from ...utils import randn_tensor +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput diff --git a/src/diffusers/pipelines/ddpm/__init__.py b/src/diffusers/pipelines/ddpm/__init__.py index bb228ee012e8..a3936af03a6a 100644 --- a/src/diffusers/pipelines/ddpm/__init__.py +++ b/src/diffusers/pipelines/ddpm/__init__.py @@ -1 +1,17 @@ -from .pipeline_ddpm import DDPMPipeline +from ...utils import ( + _LazyModule, +) + + +_import_structure = {} +_import_structure["pipeline_ddpm"] = ["DDPMPipeline"] + +import sys + + +sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, +) diff --git a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py index 1e9ead0f3d39..d34bea7f9cf0 100644 --- a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py +++ b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py @@ -17,7 +17,7 @@ import torch -from ...utils import randn_tensor +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput diff --git a/src/diffusers/pipelines/deepfloyd_if/__init__.py b/src/diffusers/pipelines/deepfloyd_if/__init__.py index 93414f20e733..a6d58cab9c81 100644 --- a/src/diffusers/pipelines/deepfloyd_if/__init__.py +++ b/src/diffusers/pipelines/deepfloyd_if/__init__.py @@ -1,54 +1,55 @@ -from dataclasses import dataclass -from typing import List, Optional, Union - -import numpy as np -import PIL - -from ...utils import BaseOutput, OptionalDependencyNotAvailable, is_torch_available, is_transformers_available -from .timesteps import ( - fast27_timesteps, - smart27_timesteps, - smart50_timesteps, - smart100_timesteps, - smart185_timesteps, - super27_timesteps, - super40_timesteps, - super100_timesteps, +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, 
+ is_transformers_available, ) -@dataclass -class IFPipelineOutput(BaseOutput): - """ - Args: - Output class for Stable Diffusion pipelines. - images (`List[PIL.Image.Image]` or `np.ndarray`) - List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, - num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. - nsfw_detected (`List[bool]`) - List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content or a watermark. `None` if safety checking could not be performed. - watermark_detected (`List[bool]`) - List of flags denoting whether the corresponding generated image likely has a watermark. `None` if safety - checking could not be performed. - """ - - images: Union[List[PIL.Image.Image], np.ndarray] - nsfw_detected: Optional[List[bool]] - watermark_detected: Optional[List[bool]] +_import_structure = {} +_dummy_objects = {} +_import_structure["timesteps"] = [ + "fast27_timesteps", + "smart27_timesteps", + "smart50_timesteps", + "smart100_timesteps", + "smart185_timesteps", + "super27_timesteps", + "super40_timesteps", + "super100_timesteps", +] try: if not (is_transformers_available() and is_torch_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) + else: - from .pipeline_if import IFPipeline - from .pipeline_if_img2img import IFImg2ImgPipeline - from .pipeline_if_img2img_superresolution import IFImg2ImgSuperResolutionPipeline - from .pipeline_if_inpainting import IFInpaintingPipeline - from .pipeline_if_inpainting_superresolution import IFInpaintingSuperResolutionPipeline - from .pipeline_if_superresolution import IFSuperResolutionPipeline - from .safety_checker import IFSafetyChecker - from .watermark import IFWatermarker + _import_structure["pipeline_output"] = ["IFPipelineOutput"] + _import_structure["pipeline_if"] = ["IFPipeline"] + _import_structure["pipeline_if_img2img"] = ["IFImg2ImgPipeline"] + _import_structure["pipeline_if_img2img_superresolution"] = ["IFImg2ImgSuperResolutionPipeline"] + _import_structure["pipeline_if_inpainting"] = ["IFInpaintingPipeline"] + _import_structure["pipeline_if_inpainting_superresolution"] = ["IFInpaintingSuperResolutionPipeline"] + _import_structure["pipeline_if_superresolution"] = ["IFSuperResolutionPipeline"] + _import_structure["safety_checker"] = ["IFSafetyChecker"] + _import_structure["watermark"] = ["IFWatermarker"] + + +import sys + + +sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, +) + +for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value)
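The deepfloyd_if conversion above also illustrates the fallback path used by every optional-dependency subpackage in this series: when `torch` or `transformers` is missing, `get_objects_from_module` harvests the placeholder classes from the dummy-objects module, and they are attached to the lazy module after the `sys.modules` swap, so the public names still import cleanly and only instantiation fails. A hedged sketch of what that amounts to (the error behavior described in the comments is illustrative):

    # Collect the placeholders the same way the converted __init__ files do.
    from diffusers.utils import dummy_torch_and_transformers_objects, get_objects_from_module

    _dummy_objects = get_objects_from_module(dummy_torch_and_transformers_objects)
    IFPipeline = _dummy_objects["IFPipeline"]  # placeholder class, not the real pipeline
    IFPipeline()  # raises, pointing the user at the missing backends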
diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py index 50939644ebd7..0f4e702268d4 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py @@ -17,9 +17,9 @@ is_bs4_available, is_ftfy_available, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from . import IFPipelineOutput from .safety_checker import IFSafetyChecker diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py index afd8f691ea68..e14133f0e481 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py @@ -20,9 +20,9 @@ is_bs4_available, is_ftfy_available, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from . import IFPipelineOutput from .safety_checker import IFSafetyChecker diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py index d00a19c92421..20ac5a90e2cc 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py @@ -21,9 +21,9 @@ is_bs4_available, is_ftfy_available, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from . import IFPipelineOutput from .safety_checker import IFSafetyChecker diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py index a15341e26b69..d54c9aedc6a5 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py @@ -20,9 +20,9 @@ is_bs4_available, is_ftfy_available, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from . import IFPipelineOutput from .safety_checker import IFSafetyChecker diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py index e523e6d332dc..1217d2d8398f 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py @@ -21,9 +21,9 @@ is_bs4_available, is_ftfy_available, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from . import IFPipelineOutput from .safety_checker import IFSafetyChecker diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py index eafdd6f0d28a..8e1a6338eaed 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py @@ -20,9 +20,9 @@ is_bs4_available, is_ftfy_available, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from .
import IFPipelineOutput from .safety_checker import IFSafetyChecker diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_output.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_output.py new file mode 100644 index 000000000000..f33c4b9e46dd --- /dev/null +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_output.py @@ -0,0 +1,28 @@ +from dataclasses import dataclass +from typing import List, Optional, Union + +import numpy as np +import PIL + +from ...utils import BaseOutput + + +@dataclass +class IFPipelineOutput(BaseOutput): + """ + Output class for DeepFloyd IF pipelines. + Args: + images (`List[PIL.Image.Image]` or `np.ndarray`) + List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, + num_channels)`. PIL images or numpy array representing the denoised images of the diffusion pipeline. + nsfw_detected (`List[bool]`) + List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content or a watermark. `None` if safety checking could not be performed. + watermark_detected (`List[bool]`) + List of flags denoting whether the corresponding generated image likely has a watermark. `None` if safety + checking could not be performed. + """ + + images: Union[List[PIL.Image.Image], np.ndarray] + nsfw_detected: Optional[List[bool]] + watermark_detected: Optional[List[bool]] diff --git a/src/diffusers/pipelines/dit/__init__.py b/src/diffusers/pipelines/dit/__init__.py index 4ef0729cb490..be3c74454393 100644 --- a/src/diffusers/pipelines/dit/__init__.py +++ b/src/diffusers/pipelines/dit/__init__.py @@ -1 +1,15 @@ -from .pipeline_dit import DiTPipeline +from ...utils import _LazyModule + + +_import_structure = {} +_import_structure["pipeline_dit"] = ["DiTPipeline"] + +import sys + + +sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, +) diff --git a/src/diffusers/pipelines/dit/pipeline_dit.py b/src/diffusers/pipelines/dit/pipeline_dit.py index d57f13c2991a..5f5b0b199168 100644 --- a/src/diffusers/pipelines/dit/pipeline_dit.py +++ b/src/diffusers/pipelines/dit/pipeline_dit.py @@ -24,7 +24,7 @@ from ...models import AutoencoderKL, Transformer2DModel from ...schedulers import KarrasDiffusionSchedulers -from ...utils import randn_tensor +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput diff --git a/src/diffusers/pipelines/kandinsky/__init__.py b/src/diffusers/pipelines/kandinsky/__init__.py index 946d31649018..cc4580721eff 100644 --- a/src/diffusers/pipelines/kandinsky/__init__.py +++ b/src/diffusers/pipelines/kandinsky/__init__.py @@ -1,23 +1,46 @@ from ...utils import ( OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, is_torch_available, is_transformers_available, ) +_import_structure = {} +_dummy_objects = {} + + try: if not (is_transformers_available() and is_torch_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ...utils.dummy_torch_and_transformers_objects import * + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) + else: - from .pipeline_kandinsky import KandinskyPipeline - from .pipeline_kandinsky_combined import ( - KandinskyCombinedPipeline, - KandinskyImg2ImgCombinedPipeline, - KandinskyInpaintCombinedPipeline, - ) - from .pipeline_kandinsky_img2img import
KandinskyImg2ImgPipeline - from .pipeline_kandinsky_inpaint import KandinskyInpaintPipeline - from .pipeline_kandinsky_prior import KandinskyPriorPipeline, KandinskyPriorPipelineOutput - from .text_encoder import MultilingualCLIP + _import_structure["pipeline_kandinsky"] = ["KandinskyPipeline"] + _import_structure["pipeline_kandinsky_combined"] = [ + "KandinskyCombinedPipeline", + "KandinskyImg2ImgCombinedPipeline", + "KandinskyInpaintCombinedPipeline", + ] + _import_structure["pipeline_kandinsky_img2img"] = ["KandinskyImg2ImgPipeline"] + _import_structure["pipeline_kandinsky_inpaint"] = ["KandinskyInpaintPipeline"] + _import_structure["pipeline_kandinsky_prior"] = ["KandinskyPriorPipeline", "KandinskyPriorPipelineOutput"] + _import_structure["text_encoder"] = ["MultilingualCLIP"] + + +import sys + + +sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, +) + +for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index 89afa0060ef8..8545b8b42ff0 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -25,9 +25,9 @@ is_accelerate_available, is_accelerate_version, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput from .text_encoder import MultilingualCLIP diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py index 5673d306aa0c..5013203049a1 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py @@ -28,9 +28,9 @@ is_accelerate_available, is_accelerate_version, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput from .text_encoder import MultilingualCLIP diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py index dda0c3faa7fd..4a920b5c3262 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -32,9 +32,9 @@ is_accelerate_available, is_accelerate_version, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput from .text_encoder import MultilingualCLIP diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index 57d8c7beb97a..b6c031feac29 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -27,9 +27,9 @@ is_accelerate_available, is_accelerate_version, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline diff --git a/src/diffusers/pipelines/kandinsky2_2/__init__.py b/src/diffusers/pipelines/kandinsky2_2/__init__.py index 4997a2e4056b..639d6ad977c2 100644 --- a/src/diffusers/pipelines/kandinsky2_2/__init__.py +++ 
b/src/diffusers/pipelines/kandinsky2_2/__init__.py @@ -1,25 +1,48 @@ from ...utils import ( OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, is_torch_available, is_transformers_available, ) +_import_structure = {} +_dummy_objects = {} + + try: if not (is_transformers_available() and is_torch_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ...utils.dummy_torch_and_transformers_objects import * + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) + else: - from .pipeline_kandinsky2_2 import KandinskyV22Pipeline - from .pipeline_kandinsky2_2_combined import ( - KandinskyV22CombinedPipeline, - KandinskyV22Img2ImgCombinedPipeline, - KandinskyV22InpaintCombinedPipeline, - ) - from .pipeline_kandinsky2_2_controlnet import KandinskyV22ControlnetPipeline - from .pipeline_kandinsky2_2_controlnet_img2img import KandinskyV22ControlnetImg2ImgPipeline - from .pipeline_kandinsky2_2_img2img import KandinskyV22Img2ImgPipeline - from .pipeline_kandinsky2_2_inpainting import KandinskyV22InpaintPipeline - from .pipeline_kandinsky2_2_prior import KandinskyV22PriorPipeline - from .pipeline_kandinsky2_2_prior_emb2emb import KandinskyV22PriorEmb2EmbPipeline + _import_structure["pipeline_kandinsky2_2"] = ["KandinskyV22Pipeline"] + _import_structure["pipeline_kandinsky2_2_combined"] = [ + "KandinskyV22CombinedPipeline", + "KandinskyV22Img2ImgCombinedPipeline", + "KandinskyV22InpaintCombinedPipeline", + ] + _import_structure["pipeline_kandinsky2_2_controlnet"] = ["KandinskyV22ControlnetPipeline"] + _import_structure["pipeline_kandinsky2_2_controlnet_img2img"] = ["KandinskyV22ControlnetImg2ImgPipeline"] + _import_structure["pipeline_kandinsky2_2_img2img"] = ["KandinskyV22Img2ImgPipeline"] + _import_structure["pipeline_kandinsky2_2_inpainting"] = ["KandinskyV22InpaintPipeline"] + _import_structure["pipeline_kandinsky2_2_prior"] = ["KandinskyV22PriorPipeline"] + _import_structure["pipeline_kandinsky2_2_prior_emb2emb"] = ["KandinskyV22PriorEmb2EmbPipeline"] + + +import sys + + +sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, +) + +for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py index ccbdae09dc08..2ff2d8b004ab 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py @@ -22,9 +22,9 @@ is_accelerate_available, is_accelerate_version, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py index 22b3eaf0915e..ec82f4516042 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py @@ -22,9 +22,9 @@ is_accelerate_available, is_accelerate_version, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
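The pipeline-module hunks throughout this series are mechanical: `randn_tensor` (and, where used, `is_compiled_module`) moved from `diffusers.utils` into the new `diffusers.utils.torch_utils`, so only the import path changes. A short sketch of the updated usage (shape and seed are arbitrary example values):

    import torch
    from diffusers.utils.torch_utils import randn_tensor

    # Same call as before the move; only the import location differs.
    latents = randn_tensor((1, 4, 64, 64), generator=torch.Generator().manual_seed(0))

diff --git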
a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py index 1b3328faaf97..8a2deb52fbce 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py @@ -25,9 +25,9 @@ is_accelerate_available, is_accelerate_version, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py index 82e609ce7cd1..9b0f576fa7d0 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py @@ -25,9 +25,9 @@ is_accelerate_available, is_accelerate_version, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py index 2e0a0d833740..7320a62ef6e0 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py @@ -29,9 +29,9 @@ is_accelerate_available, is_accelerate_version, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py index 3cf33b563145..943363dc7795 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py @@ -10,9 +10,9 @@ is_accelerate_available, is_accelerate_version, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import randn_tensor from ..kandinsky import KandinskyPriorPipelineOutput from ..pipeline_utils import DiffusionPipeline diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py index 75e1644f6186..f17f463b9bfe 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py @@ -10,9 +10,9 @@ is_accelerate_available, is_accelerate_version, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import randn_tensor from ..kandinsky import KandinskyPriorPipelineOutput from ..pipeline_utils import DiffusionPipeline diff --git a/src/diffusers/pipelines/latent_diffusion/__init__.py b/src/diffusers/pipelines/latent_diffusion/__init__.py index a6c16f598695..a78e6622bcfe 100644 --- a/src/diffusers/pipelines/latent_diffusion/__init__.py +++ b/src/diffusers/pipelines/latent_diffusion/__init__.py @@ -1,11 +1,37 @@ -from ...utils import OptionalDependencyNotAvailable, is_torch_available, is_transformers_available -from .pipeline_latent_diffusion_superresolution import LDMSuperResolutionPipeline +from ...utils import ( + 
OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) + + +_import_structure = {} +_dummy_objects = {} try: if not (is_transformers_available() and is_torch_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ...utils.dummy_torch_and_transformers_objects import ShapEPipeline + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) else: - from .pipeline_latent_diffusion import LDMBertModel, LDMTextToImagePipeline + _import_structure["pipeline_latent_diffusion"] = ["LDMBertModel", "LDMTextToImagePipeline"] + _import_structure["pipeline_latent_diffusion_superresolution"] = ["LDMSuperResolutionPipeline"] + + +import sys + + +sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, +) + +for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py index e86f7b985e47..4b4315a421e8 100644 --- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py @@ -25,7 +25,7 @@ from ...models import AutoencoderKL, UNet2DConditionModel, UNet2DModel, VQModel from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler -from ...utils import randn_tensor +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py index c8d5c1a1891d..def1183abc9e 100644 --- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py @@ -15,7 +15,8 @@ LMSDiscreteScheduler, PNDMScheduler, ) -from ...utils import PIL_INTERPOLATION, randn_tensor +from ...utils import PIL_INTERPOLATION +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput diff --git a/src/diffusers/pipelines/latent_diffusion_uncond/__init__.py b/src/diffusers/pipelines/latent_diffusion_uncond/__init__.py index 1b9fc5270a62..73e5c703f61a 100644 --- a/src/diffusers/pipelines/latent_diffusion_uncond/__init__.py +++ b/src/diffusers/pipelines/latent_diffusion_uncond/__init__.py @@ -1 +1,15 @@ -from .pipeline_latent_diffusion_uncond import LDMPipeline +from ...utils import _LazyModule + + +_import_structure = {} +_import_structure["pipeline_latent_diffusion_uncond"] = ["LDMPipeline"] + +import sys + + +sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, +) diff --git a/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py b/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py index be130a74c28c..f3638eee86fc 100644 --- a/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py +++ b/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py @@ -19,7 +19,7 @@ from ...models import UNet2DModel, VQModel from 
...schedulers import DDIMScheduler -from ...utils import randn_tensor +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput diff --git a/src/diffusers/pipelines/musicldm/__init__.py b/src/diffusers/pipelines/musicldm/__init__.py index b82f429798e7..6228f763a53b 100644 --- a/src/diffusers/pipelines/musicldm/__init__.py +++ b/src/diffusers/pipelines/musicldm/__init__.py @@ -1,17 +1,36 @@ from ...utils import ( OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, is_torch_available, is_transformers_available, is_transformers_version, ) +_import_structure = {} +_dummy_objects = {} + try: if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ...utils.dummy_torch_and_transformers_objects import ( - MusicLDMPipeline, - ) + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) + else: - from .pipeline_musicldm import MusicLDMPipeline + _import_structure["pipeline_musicldm"] = ["MusicLDMPipeline"] + +import sys + + +sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, +) + +for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/src/diffusers/pipelines/musicldm/pipeline_musicldm.py b/src/diffusers/pipelines/musicldm/pipeline_musicldm.py index 802de432e1c0..a891099f1aac 100644 --- a/src/diffusers/pipelines/musicldm/pipeline_musicldm.py +++ b/src/diffusers/pipelines/musicldm/pipeline_musicldm.py @@ -28,7 +28,8 @@ from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers -from ...utils import is_librosa_available, logging, randn_tensor, replace_example_docstring +from ...utils import is_librosa_available, logging, replace_example_docstring +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline diff --git a/src/diffusers/pipelines/paint_by_example/__init__.py b/src/diffusers/pipelines/paint_by_example/__init__.py index 9d3ce86531ee..c19ce1036e3f 100644 --- a/src/diffusers/pipelines/paint_by_example/__init__.py +++ b/src/diffusers/pipelines/paint_by_example/__init__.py @@ -5,14 +5,38 @@ import PIL from PIL import Image -from ...utils import OptionalDependencyNotAvailable, is_torch_available, is_transformers_available +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) +_import_structure = {} +_dummy_objects = {} + try: if not (is_transformers_available() and is_torch_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ...utils.dummy_torch_and_transformers_objects import ShapEPipeline + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) else: - from .image_encoder import PaintByExampleImageEncoder - from .pipeline_paint_by_example import PaintByExamplePipeline + _import_structure["image_encoder"] = ["PaintByExampleImageEncoder"] + _import_structure["pipeline_paint_by_example"] = ["PaintByExamplePipeline"] + +import sys + + +sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + 
_import_structure, + module_spec=__spec__, +) + +for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py b/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py index a0e0f9f6d624..383edae08e8f 100644 --- a/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +++ b/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py @@ -23,7 +23,8 @@ from ...image_processor import VaeImageProcessor from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler -from ...utils import deprecate, logging, randn_tensor +from ...utils import deprecate, logging +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from ..stable_diffusion import StableDiffusionPipelineOutput from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 110c97acdcdf..fb120ebc7d3b 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -51,12 +51,12 @@ get_class_from_dynamic_module, is_accelerate_available, is_accelerate_version, - is_compiled_module, is_torch_version, is_transformers_available, logging, numpy_to_pil, ) +from ..utils.torch_utils import is_compiled_module if is_transformers_available(): diff --git a/src/diffusers/pipelines/pndm/__init__.py b/src/diffusers/pipelines/pndm/__init__.py index 488eb4f5f2b2..7374016c32d9 100644 --- a/src/diffusers/pipelines/pndm/__init__.py +++ b/src/diffusers/pipelines/pndm/__init__.py @@ -1 +1,16 @@ -from .pipeline_pndm import PNDMPipeline +from ...utils import _LazyModule + + +_import_structure = {} +_import_structure["pipeline_pndm"] = ["PNDMPipeline"] + + +import sys + + +sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, +) diff --git a/src/diffusers/pipelines/pndm/pipeline_pndm.py b/src/diffusers/pipelines/pndm/pipeline_pndm.py index 4add91fd1a69..78690997223a 100644 --- a/src/diffusers/pipelines/pndm/pipeline_pndm.py +++ b/src/diffusers/pipelines/pndm/pipeline_pndm.py @@ -19,7 +19,7 @@ from ...models import UNet2DModel from ...schedulers import PNDMScheduler -from ...utils import randn_tensor +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput diff --git a/src/diffusers/pipelines/repaint/__init__.py b/src/diffusers/pipelines/repaint/__init__.py index 16bc86d1cedf..2a0eedf30bbf 100644 --- a/src/diffusers/pipelines/repaint/__init__.py +++ b/src/diffusers/pipelines/repaint/__init__.py @@ -1 +1,15 @@ -from .pipeline_repaint import RePaintPipeline +from ...utils import _LazyModule + + +_import_structure = {} +_import_structure["pipeline_repaint"] = ["RePaintPipeline"] + +import sys + + +sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, +) diff --git a/src/diffusers/pipelines/repaint/pipeline_repaint.py b/src/diffusers/pipelines/repaint/pipeline_repaint.py index 398a50cf5e25..5372c2431d52 100644 --- a/src/diffusers/pipelines/repaint/pipeline_repaint.py +++ b/src/diffusers/pipelines/repaint/pipeline_repaint.py @@ -21,7 +21,8 @@ from ...models import UNet2DModel from ...schedulers import RePaintScheduler -from ...utils import PIL_INTERPOLATION, deprecate, 
logging, randn_tensor +from ...utils import PIL_INTERPOLATION, deprecate, logging +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput diff --git a/src/diffusers/pipelines/score_sde_ve/__init__.py b/src/diffusers/pipelines/score_sde_ve/__init__.py index c7c2a85c067b..2cd7ac2bf440 100644 --- a/src/diffusers/pipelines/score_sde_ve/__init__.py +++ b/src/diffusers/pipelines/score_sde_ve/__init__.py @@ -1 +1,15 @@ -from .pipeline_score_sde_ve import ScoreSdeVePipeline +from ...utils import _LazyModule + + +_import_structure = {} +_import_structure["pipeline_score_sde_ve"] = ["ScoreSdeVePipeline"] + +import sys + + +sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, +) diff --git a/src/diffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py b/src/diffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py index ace4f0c60db8..eb98479b9b61 100644 --- a/src/diffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py +++ b/src/diffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py @@ -18,7 +18,7 @@ from ...models import UNet2DModel from ...schedulers import ScoreSdeVeScheduler -from ...utils import randn_tensor +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput diff --git a/src/diffusers/pipelines/semantic_stable_diffusion/__init__.py b/src/diffusers/pipelines/semantic_stable_diffusion/__init__.py index 95d3604bcf09..1b743ac3d58d 100644 --- a/src/diffusers/pipelines/semantic_stable_diffusion/__init__.py +++ b/src/diffusers/pipelines/semantic_stable_diffusion/__init__.py @@ -1,36 +1,38 @@ -from dataclasses import dataclass -from enum import Enum -from typing import List, Optional, Union +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) -import numpy as np -import PIL -from PIL import Image -from ...utils import BaseOutput, OptionalDependencyNotAvailable, is_torch_available, is_transformers_available - - -@dataclass -class SemanticStableDiffusionPipelineOutput(BaseOutput): - """ - Output class for Stable Diffusion pipelines. - - Args: - images (`List[PIL.Image.Image]` or `np.ndarray`) - List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width, - num_channels)`. - nsfw_content_detected (`List[bool]`) - List indicating whether the corresponding generated image contains “not-safe-for-work” (nsfw) content or - `None` if safety checking could not be performed. 
- """ - - images: Union[List[PIL.Image.Image], np.ndarray] - nsfw_content_detected: Optional[List[bool]] +_import_structure = {} +_dummy_objects = {} try: if not (is_transformers_available() and is_torch_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) + else: - from .pipeline_semantic_stable_diffusion import SemanticStableDiffusionPipeline + _import_structure["pipeline_output"] = ["SemanticStableDiffusionPipelineOutput"] + _import_structure["pipeline_semantic_stable_diffusion"] = ["SemanticStableDiffusionPipeline"] + + +import sys + + +sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, +) + +for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_output.py b/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_output.py new file mode 100644 index 000000000000..172715da864e --- /dev/null +++ b/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_output.py @@ -0,0 +1,25 @@ +from dataclasses import dataclass +from typing import List, Optional, Union + +import numpy as np +import PIL + +from ...utils import BaseOutput + + +@dataclass +class SemanticStableDiffusionPipelineOutput(BaseOutput): + """ + Output class for Stable Diffusion pipelines. + + Args: + images (`List[PIL.Image.Image]` or `np.ndarray`) + List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width, + num_channels)`. + nsfw_content_detected (`List[bool]`) + List indicating whether the corresponding generated image contains “not-safe-for-work” (nsfw) content or + `None` if safety checking could not be performed. + """ + + images: Union[List[PIL.Image.Image], np.ndarray] + nsfw_content_detected: Optional[List[bool]] diff --git a/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py b/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py index b9ad42c4722d..c27b03968ec1 100644 --- a/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +++ b/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py @@ -9,7 +9,8 @@ from ...models import AutoencoderKL, UNet2DConditionModel from ...pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker from ...schedulers import KarrasDiffusionSchedulers -from ...utils import deprecate, logging, randn_tensor +from ...utils import deprecate, logging +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from . 
import SemanticStableDiffusionPipelineOutput diff --git a/src/diffusers/pipelines/shap_e/__init__.py b/src/diffusers/pipelines/shap_e/__init__.py index 04aa1f2f6d78..2a56148fee91 100644 --- a/src/diffusers/pipelines/shap_e/__init__.py +++ b/src/diffusers/pipelines/shap_e/__init__.py @@ -1,27 +1,47 @@ from ...utils import ( OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, is_torch_available, is_transformers_available, - is_transformers_version, ) +_import_structure = {} +_dummy_objects = {} + + try: if not (is_transformers_available() and is_torch_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ...utils.dummy_torch_and_transformers_objects import ShapEPipeline + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) else: - from .camera import create_pan_cameras - from .pipeline_shap_e import ShapEPipeline - from .pipeline_shap_e_img2img import ShapEImg2ImgPipeline - from .renderer import ( - BoundingBoxVolume, - ImportanceRaySampler, - MLPNeRFModelOutput, - MLPNeRSTFModel, - ShapEParamsProjModel, - ShapERenderer, - StratifiedRaySampler, - VoidNeRFModel, - ) + _import_structure["camera"] = ["create_pan_cameras"] + _import_structure["pipeline_shap_e"] = ["ShapEPipeline"] + _import_structure["pipeline_shap_e_img2img"] = ["ShapEImg2ImgPipeline"] + _import_structure["renderer"] = [ + "BoundingBoxVolume", + "ImportanceRaySampler", + "MLPNeRFModelOutput", + "MLPNeRSTFModel", + "ShapEParamsProjModel", + "ShapERenderer", + "StratifiedRaySampler", + "VoidNeRFModel", + ] + +import sys + + +sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, +) + +for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index 266075d93b30..7a6cd4589a0a 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -28,9 +28,9 @@ is_accelerate_available, is_accelerate_version, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from .renderer import ShapERenderer diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py index 6aa75ca0d541..a8ef7aa09027 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py @@ -25,9 +25,9 @@ from ...utils import ( BaseOutput, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from .renderer import ShapERenderer diff --git a/src/diffusers/pipelines/spectrogram_diffusion/__init__.py b/src/diffusers/pipelines/spectrogram_diffusion/__init__.py index 05b14a857630..e8bcf63c2986 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/__init__.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/__init__.py @@ -1,21 +1,33 @@ # flake8: noqa -from ...utils import is_note_seq_available, is_transformers_available, is_torch_available -from ...utils import OptionalDependencyNotAvailable +from ...utils import ( + _LazyModule, + is_note_seq_available, + OptionalDependencyNotAvailable, + is_torch_available, + 
is_transformers_available, + get_objects_from_module, +) + +_import_structure = {} +_dummy_objects = {} try: if not (is_transformers_available() and is_torch_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) + else: - from .notes_encoder import SpectrogramNotesEncoder - from .continous_encoder import SpectrogramContEncoder - from .pipeline_spectrogram_diffusion import ( - SpectrogramContEncoder, - SpectrogramDiffusionPipeline, - T5FilmDecoder, - ) + _import_structure["notes_encoder"] = ["SpectrogramNotesEncoder"] + _import_structure["continous_encoder"] = ["SpectrogramContEncoder"] + _import_structure["pipeline_spectrogram_diffusion"] = [ + "SpectrogramContEncoder", + "SpectrogramDiffusionPipeline", + "T5FilmDecoder", + ] try: if not (is_transformers_available() and is_torch_available() and is_note_seq_available()): @@ -23,4 +35,16 @@ except OptionalDependencyNotAvailable: from ...utils.dummy_transformers_and_torch_and_note_seq_objects import * # noqa F403 else: - from .midi_utils import MidiProcessor + _import_structure["midi_utils"] = ["MidiProcessor"] + +import sys + +sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, +) + +for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index bb3922e77fd1..5ab503df49ca 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -21,7 +21,8 @@ from ...models import T5FilmDecoder from ...schedulers import DDPMScheduler -from ...utils import is_onnx_available, logging, randn_tensor +from ...utils import is_onnx_available, logging +from ...utils.torch_utils import randn_tensor if is_onnx_available(): diff --git a/src/diffusers/pipelines/stable_diffusion/__init__.py b/src/diffusers/pipelines/stable_diffusion/__init__.py index b92081434556..f6f3327c5fb6 100644 --- a/src/diffusers/pipelines/stable_diffusion/__init__.py +++ b/src/diffusers/pipelines/stable_diffusion/__init__.py @@ -1,13 +1,7 @@ -from dataclasses import dataclass -from typing import List, Optional, Union - -import numpy as np -import PIL -from PIL import Image - from ...utils import ( - BaseOutput, OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, is_flax_available, is_k_diffusion_available, is_k_diffusion_version, @@ -18,59 +12,56 @@ ) -@dataclass -class StableDiffusionPipelineOutput(BaseOutput): - """ - Output class for Stable Diffusion pipelines. - - Args: - images (`List[PIL.Image.Image]` or `np.ndarray`) - List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width, - num_channels)`. - nsfw_content_detected (`List[bool]`) - List indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content or - `None` if safety checking could not be performed. 
- """ +_import_structure = {} +_additional_imports = {} +_dummy_objects = {} - images: Union[List[PIL.Image.Image], np.ndarray] - nsfw_content_detected: Optional[List[bool]] +_import_structure["pipeline_output"] = ["StableDiffusionPipelineOutput"] +if is_transformers_available() and is_flax_available(): + _import_structure["pipeline_output"].extend(["FlaxStableDiffusionPipelineOutput"]) try: if not (is_transformers_available() and is_torch_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) + else: - from .clip_image_project_model import CLIPImageProjection - from .pipeline_cycle_diffusion import CycleDiffusionPipeline - from .pipeline_stable_diffusion import StableDiffusionPipeline - from .pipeline_stable_diffusion_attend_and_excite import StableDiffusionAttendAndExcitePipeline - from .pipeline_stable_diffusion_gligen import StableDiffusionGLIGENPipeline - from .pipeline_stable_diffusion_gligen_text_image import StableDiffusionGLIGENTextImagePipeline - from .pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipeline - from .pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipeline - from .pipeline_stable_diffusion_inpaint_legacy import StableDiffusionInpaintPipelineLegacy - from .pipeline_stable_diffusion_instruct_pix2pix import StableDiffusionInstructPix2PixPipeline - from .pipeline_stable_diffusion_latent_upscale import StableDiffusionLatentUpscalePipeline - from .pipeline_stable_diffusion_ldm3d import StableDiffusionLDM3DPipeline - from .pipeline_stable_diffusion_model_editing import StableDiffusionModelEditingPipeline - from .pipeline_stable_diffusion_panorama import StableDiffusionPanoramaPipeline - from .pipeline_stable_diffusion_paradigms import StableDiffusionParadigmsPipeline - from .pipeline_stable_diffusion_sag import StableDiffusionSAGPipeline - from .pipeline_stable_diffusion_upscale import StableDiffusionUpscalePipeline - from .pipeline_stable_unclip import StableUnCLIPPipeline - from .pipeline_stable_unclip_img2img import StableUnCLIPImg2ImgPipeline - from .safety_checker import StableDiffusionSafetyChecker - from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer + _import_structure["pipeline_cycle_diffusion"] = ["CycleDiffusionPipeline"] + _import_structure["pipeline_stable_diffusion"] = ["StableDiffusionPipeline"] + _import_structure["pipeline_stable_diffusion_attend_and_excite"] = ["StableDiffusionAttendAndExcitePipeline"] + _import_structure["pipeline_stable_diffusion_gligen"] = ["StableDiffusionGLIGENPipeline"] + _import_structure["pipeline_stable_diffusion_img2img"] = ["StableDiffusionImg2ImgPipeline"] + _import_structure["pipeline_stable_diffusion_inpaint"] = ["StableDiffusionInpaintPipeline"] + _import_structure["pipeline_stable_diffusion_inpaint_legacy"] = ["StableDiffusionInpaintPipelineLegacy"] + _import_structure["pipeline_stable_diffusion_instruct_pix2pix"] = ["StableDiffusionInstructPix2PixPipeline"] + _import_structure["pipeline_stable_diffusion_latent_upscale"] = ["StableDiffusionLatentUpscalePipeline"] + _import_structure["pipeline_stable_diffusion_ldm3d"] = ["StableDiffusionLDM3DPipeline"] + _import_structure["pipeline_stable_diffusion_model_editing"] = ["StableDiffusionModelEditingPipeline"] + 
_import_structure["pipeline_stable_diffusion_panorama"] = ["StableDiffusionPanoramaPipeline"] + _import_structure["pipeline_stable_diffusion_paradigms"] = ["StableDiffusionParadigmsPipeline"] + _import_structure["pipeline_stable_diffusion_sag"] = ["StableDiffusionSAGPipeline"] + _import_structure["pipeline_stable_diffusion_upscale"] = ["StableDiffusionUpscalePipeline"] + _import_structure["pipeline_stable_unclip"] = ["StableUnCLIPPipeline"] + _import_structure["pipeline_stable_unclip_img2img"] = ["StableUnCLIPImg2ImgPipeline"] + _import_structure["safety_checker"] = ["StableDiffusionSafetyChecker"] + _import_structure["stable_unclip_image_normalizer"] = ["StableUnCLIPImageNormalizer"] + _import_structure["pipeline_stable_diffusion_gligen_text_image"] = ["StableDiffusionGLIGENTextImagePipeline"] + _import_structure["pipeline_stable_diffusion_gligen"] = ["StableDiffusionGLIGENPipeline"] + _import_structure["clip_image_project_model"] = ["CLIPImageProjection"] try: if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0")): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from ...utils.dummy_torch_and_transformers_objects import StableDiffusionImageVariationPipeline + + _dummy_objects.update({"StableDiffusionImageVariationPipeline": StableDiffusionImageVariationPipeline}) else: - from .pipeline_stable_diffusion_image_variation import StableDiffusionImageVariationPipeline + _import_structure["pipeline_stable_diffusion_image_variation"] = ["StableDiffusionImageVariationPipeline"] try: @@ -82,10 +73,18 @@ class StableDiffusionPipelineOutput(BaseOutput): StableDiffusionDiffEditPipeline, StableDiffusionPix2PixZeroPipeline, ) + + _dummy_objects.update( + { + "StableDiffusionDepth2ImgPipeline": StableDiffusionDepth2ImgPipeline, + "StableDiffusionDiffEditPipeline": StableDiffusionDiffEditPipeline, + "StableDiffusionPix2PixZeroPipeline": StableDiffusionPix2PixZeroPipeline, + } + ) else: - from .pipeline_stable_diffusion_depth2img import StableDiffusionDepth2ImgPipeline - from .pipeline_stable_diffusion_diffedit import StableDiffusionDiffEditPipeline - from .pipeline_stable_diffusion_pix2pix_zero import StableDiffusionPix2PixZeroPipeline + _import_structure["pipeline_stable_diffusion_depth2img"] = ["StableDiffusionDepth2ImgPipeline"] + _import_structure["pipeline_stable_diffusion_diffedit"] = ["StableDiffusionDiffEditPipeline"] + _import_structure["pipeline_stable_diffusion_pix2pix_zero"] = ["StableDiffusionPix2PixZeroPipeline"] try: @@ -97,43 +96,52 @@ class StableDiffusionPipelineOutput(BaseOutput): ): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ...utils.dummy_torch_and_transformers_and_k_diffusion_objects import * # noqa F403 + from ...utils import dummy_torch_and_transformers_and_k_diffusion_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_and_k_diffusion_objects)) + else: - from .pipeline_stable_diffusion_k_diffusion import StableDiffusionKDiffusionPipeline + _import_structure["pipeline_stable_diffusion_k_diffusion"] = ["StableDiffusionKDiffusionPipeline"] try: if not (is_transformers_available() and is_onnx_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ...utils.dummy_onnx_objects import * # noqa F403 + from ...utils import dummy_onnx_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_onnx_objects)) + else: - from .pipeline_onnx_stable_diffusion import 
OnnxStableDiffusionPipeline, StableDiffusionOnnxPipeline - from .pipeline_onnx_stable_diffusion_img2img import OnnxStableDiffusionImg2ImgPipeline - from .pipeline_onnx_stable_diffusion_inpaint import OnnxStableDiffusionInpaintPipeline - from .pipeline_onnx_stable_diffusion_inpaint_legacy import OnnxStableDiffusionInpaintPipelineLegacy - from .pipeline_onnx_stable_diffusion_upscale import OnnxStableDiffusionUpscalePipeline + _import_structure["pipeline_onnx_stable_diffusion"] = [ + "OnnxStableDiffusionPipeline", + "StableDiffusionOnnxPipeline", + ] + _import_structure["pipeline_onnx_stable_diffusion_img2img"] = ["OnnxStableDiffusionImg2ImgPipeline"] + _import_structure["pipeline_onnx_stable_diffusion_inpaint"] = ["OnnxStableDiffusionInpaintPipeline"] + _import_structure["pipeline_onnx_stable_diffusion_inpaint_legacy"] = ["OnnxStableDiffusionInpaintPipelineLegacy"] + _import_structure["pipeline_onnx_stable_diffusion_upscale"] = ["OnnxStableDiffusionUpscalePipeline"] if is_transformers_available() and is_flax_available(): - import flax + from ...schedulers.scheduling_pndm_flax import PNDMSchedulerState - @flax.struct.dataclass - class FlaxStableDiffusionPipelineOutput(BaseOutput): - """ - Output class for Flax-based Stable Diffusion pipelines. + _additional_imports.update({"PNDMSchedulerState": PNDMSchedulerState}) - Args: - images (`np.ndarray`): - Denoised images of array shape of `(batch_size, height, width, num_channels)`. - nsfw_content_detected (`List[bool]`): - List indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content - or `None` if safety checking could not be performed. - """ + _import_structure["pipeline_flax_stable_diffusion"] = ["FlaxStableDiffusionPipeline"] + _import_structure["pipeline_flax_stable_diffusion_img2img"] = ["FlaxStableDiffusionImg2ImgPipeline"] + _import_structure["pipeline_flax_stable_diffusion_inpaint"] = ["FlaxStableDiffusionInpaintPipeline"] + _import_structure["safety_checker_flax"] = ["FlaxStableDiffusionSafetyChecker"] - images: np.ndarray - nsfw_content_detected: List[bool] +import sys - from ...schedulers.scheduling_pndm_flax import PNDMSchedulerState - from .pipeline_flax_stable_diffusion import FlaxStableDiffusionPipeline - from .pipeline_flax_stable_diffusion_img2img import FlaxStableDiffusionImg2ImgPipeline - from .pipeline_flax_stable_diffusion_inpaint import FlaxStableDiffusionInpaintPipeline - from .safety_checker_flax import FlaxStableDiffusionSafetyChecker + +sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, +) + +for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) +for name, value in _additional_imports.items(): + setattr(sys.modules[__name__], name, value) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py index 9a3b828828e3..6896ef94a3cf 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py @@ -29,7 +29,8 @@ from ...models import AutoencoderKL, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import DDIMScheduler -from ...utils import PIL_INTERPOLATION, deprecate, logging, randn_tensor +from ...utils import PIL_INTERPOLATION, deprecate, logging +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from . 
import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_output.py b/src/diffusers/pipelines/stable_diffusion/pipeline_output.py new file mode 100644 index 000000000000..0ac9d9e1a039 --- /dev/null +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_output.py @@ -0,0 +1,49 @@ +from dataclasses import dataclass +from typing import List, Optional, Union + +import numpy as np +import PIL + +from ...utils import ( + BaseOutput, + is_flax_available, + is_transformers_available, +) + + +@dataclass +class StableDiffusionPipelineOutput(BaseOutput): + """ + Output class for Stable Diffusion pipelines. + + Args: + images (`List[PIL.Image.Image]` or `np.ndarray`) + List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width, + num_channels)`. + nsfw_content_detected (`List[bool]`) + List indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content or + `None` if safety checking could not be performed. + """ + + images: Union[List[PIL.Image.Image], np.ndarray] + nsfw_content_detected: Optional[List[bool]] + + +if is_transformers_available() and is_flax_available(): + import flax + + @flax.struct.dataclass + class FlaxStableDiffusionPipelineOutput(BaseOutput): + """ + Output class for Flax-based Stable Diffusion pipelines. + + Args: + images (`np.ndarray`): + Denoised images of array shape of `(batch_size, height, width, num_channels)`. + nsfw_content_detected (`List[bool]`): + List indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content + or `None` if safety checking could not be performed. + """ + + images: np.ndarray + nsfw_content_detected: List[bool] diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 6faec1f9a140..a84b316bbf62 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -30,9 +30,9 @@ is_accelerate_available, is_accelerate_version, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from . import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py index b5f94add9f18..d64e02e8ecd0 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py @@ -27,7 +27,8 @@ from ...models.attention_processor import Attention from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers -from ...utils import deprecate, logging, randn_tensor, replace_example_docstring +from ...utils import deprecate, logging, replace_example_docstring +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from . 
import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py index 0ab0b85a46c2..3be87fe641f6 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py @@ -28,7 +28,8 @@ from ...models import AutoencoderKL, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers -from ...utils import PIL_INTERPOLATION, deprecate, logging, randn_tensor +from ...utils import PIL_INTERPOLATION, deprecate, logging +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py index 261dabe46754..13522fa780ca 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py @@ -35,9 +35,9 @@ is_accelerate_available, is_accelerate_version, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from . import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_gligen.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_gligen.py index 78d0e852a632..7748896524c0 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_gligen.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_gligen.py @@ -31,9 +31,9 @@ is_accelerate_available, is_accelerate_version, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from . import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_gligen_text_image.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_gligen_text_image.py index 0940b830065c..01cef5438a1e 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_gligen_text_image.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_gligen_text_image.py @@ -36,9 +36,9 @@ is_accelerate_available, is_accelerate_version, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from . 
import StableDiffusionPipelineOutput from .clip_image_project_model import CLIPImageProjection diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py index d6214b8c041c..328e7165a188 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py @@ -24,7 +24,8 @@ from ...image_processor import VaeImageProcessor from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers -from ...utils import deprecate, logging, randn_tensor +from ...utils import deprecate, logging +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from . import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 5e7f5f01cb28..13d971de2844 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -33,9 +33,9 @@ is_accelerate_available, is_accelerate_version, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from . import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index c1fb5831a305..a01442df5ce8 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -27,7 +27,8 @@ from ...models import AsymmetricAutoencoderKL, AutoencoderKL, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers -from ...utils import deprecate, is_accelerate_available, is_accelerate_version, logging, randn_tensor +from ...utils import deprecate, is_accelerate_available, is_accelerate_version, logging +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from . 
import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py index f5b60d95e543..3be6fc93e970 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py @@ -27,14 +27,8 @@ from ...models import AutoencoderKL, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers -from ...utils import ( - PIL_INTERPOLATION, - deprecate, - is_accelerate_available, - is_accelerate_version, - logging, - randn_tensor, -) +from ...utils import PIL_INTERPOLATION, deprecate, is_accelerate_available, is_accelerate_version, logging +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from . import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py index 8afaec267c9b..8ed36f771db9 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py @@ -24,14 +24,8 @@ from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers -from ...utils import ( - PIL_INTERPOLATION, - deprecate, - is_accelerate_available, - is_accelerate_version, - logging, - randn_tensor, -) +from ...utils import PIL_INTERPOLATION, deprecate, is_accelerate_available, is_accelerate_version, logging +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from . import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py index 92e481e707c3..f4509cd4a960 100755 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py @@ -24,7 +24,8 @@ from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import LMSDiscreteScheduler -from ...utils import deprecate, is_accelerate_available, is_accelerate_version, logging, randn_tensor +from ...utils import deprecate, is_accelerate_available, is_accelerate_version, logging +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from . 
import StableDiffusionPipelineOutput diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py index 323a583d4558..4141b65f5096 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py @@ -24,7 +24,8 @@ from ...image_processor import PipelineImageInput, VaeImageProcessor from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import EulerDiscreteScheduler -from ...utils import deprecate, logging, randn_tensor +from ...utils import deprecate, logging +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py index 13ccb226b0d7..3400497670c9 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py @@ -32,9 +32,9 @@ is_accelerate_available, is_accelerate_version, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from .safety_checker import StableDiffusionSafetyChecker diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py index 0b96a2cc8195..a92515cfb4a5 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py @@ -24,7 +24,8 @@ from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import PNDMScheduler from ...schedulers.scheduling_utils import SchedulerMixin -from ...utils import deprecate, logging, randn_tensor +from ...utils import deprecate, logging +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from . import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py index 84bd9f7e8815..0956bfefa372 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py @@ -23,7 +23,8 @@ from ...models import AutoencoderKL, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import DDIMScheduler -from ...utils import deprecate, logging, randn_tensor, replace_example_docstring +from ...utils import deprecate, logging, replace_example_docstring +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from . 
import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_paradigms.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_paradigms.py index 7ce3dfc35908..cf597ac062bf 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_paradigms.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_paradigms.py @@ -28,9 +28,9 @@ is_accelerate_available, is_accelerate_version, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from . import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py index f2b281e8c6c7..be3ffa4071eb 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py @@ -42,9 +42,9 @@ is_accelerate_available, is_accelerate_version, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from . import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py index 539696e9d5b6..7580c11936c0 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py @@ -24,7 +24,8 @@ from ...models import AutoencoderKL, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers -from ...utils import deprecate, logging, randn_tensor, replace_example_docstring +from ...utils import deprecate, logging, replace_example_docstring +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from . import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py index d8700d582f5e..4e5e77a5e2db 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py @@ -32,7 +32,8 @@ ) from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import DDPMScheduler, KarrasDiffusionSchedulers -from ...utils import deprecate, is_accelerate_available, is_accelerate_version, logging, randn_tensor +from ...utils import deprecate, is_accelerate_available, is_accelerate_version, logging +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from . 
import StableDiffusionPipelineOutput diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py index 10207d0ba32d..2ac9a52570ca 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py @@ -30,9 +30,9 @@ is_accelerate_available, is_accelerate_version, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py index 1a7427c21bc5..dae0846ea64b 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py @@ -27,7 +27,8 @@ from ...models.embeddings import get_timestep_embedding from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers -from ...utils import deprecate, is_accelerate_version, logging, randn_tensor, replace_example_docstring +from ...utils import deprecate, is_accelerate_version, logging, replace_example_docstring +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer diff --git a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py index 10bb7418e2c3..88b6e29f4b21 100644 --- a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +++ b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py @@ -10,7 +10,8 @@ from ...configuration_utils import FrozenDict from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers -from ...utils import deprecate, logging, randn_tensor +from ...utils import deprecate, logging +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from . import StableDiffusionSafePipelineOutput from .safety_checker import SafeStableDiffusionSafetyChecker diff --git a/src/diffusers/pipelines/stable_diffusion_xl/__init__.py b/src/diffusers/pipelines/stable_diffusion_xl/__init__.py index 02bd96cfc23c..ebe12db15fd9 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/__init__.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/__init__.py @@ -1,38 +1,39 @@ -from dataclasses import dataclass -from typing import List, Optional, Union - -import numpy as np -import PIL - from ...utils import ( - BaseOutput, OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, is_torch_available, is_transformers_available, ) -@dataclass -class StableDiffusionXLPipelineOutput(BaseOutput): - """ - Output class for Stable Diffusion pipelines. - - Args: - images (`List[PIL.Image.Image]` or `np.ndarray`) - List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, - num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. 
- """ - - images: Union[List[PIL.Image.Image], np.ndarray] +_import_structure = {} +_dummy_objects = {} +_import_structure["pipeline_output"] = ["StableDiffusionXLPipelineOutput"] try: if not (is_transformers_available() and is_torch_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) else: - from .pipeline_stable_diffusion_xl import StableDiffusionXLPipeline - from .pipeline_stable_diffusion_xl_img2img import StableDiffusionXLImg2ImgPipeline - from .pipeline_stable_diffusion_xl_inpaint import StableDiffusionXLInpaintPipeline - from .pipeline_stable_diffusion_xl_instruct_pix2pix import StableDiffusionXLInstructPix2PixPipeline + _import_structure["pipeline_stable_diffusion_xl"] = ["StableDiffusionXLPipeline"] + _import_structure["pipeline_stable_diffusion_xl_img2img"] = ["StableDiffusionXLImg2ImgPipeline"] + _import_structure["pipeline_stable_diffusion_xl_inpaint"] = ["StableDiffusionXLInpaintPipeline"] + _import_structure["pipeline_stable_diffusion_xl_instruct_pix2pix"] = ["StableDiffusionXLInstructPix2PixPipeline"] + +import sys + + +sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, +) + +for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_output.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_output.py new file mode 100644 index 000000000000..0c9515da34ef --- /dev/null +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_output.py @@ -0,0 +1,21 @@ +from dataclasses import dataclass +from typing import List, Union + +import numpy as np +import PIL + +from ...utils import BaseOutput + + +@dataclass +class StableDiffusionXLPipelineOutput(BaseOutput): + """ + Output class for Stable Diffusion pipelines. + + Args: + images (`List[PIL.Image.Image]` or `np.ndarray`) + List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, + num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. + """ + + images: Union[List[PIL.Image.Image], np.ndarray] diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index 7b7755085ed6..81c783bdfd2f 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -39,9 +39,9 @@ is_accelerate_version, is_invisible_watermark_available, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from . 
import StableDiffusionXLPipelineOutput diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index 04902234d54e..5af3b07f28a3 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -36,9 +36,9 @@ is_accelerate_version, is_invisible_watermark_available, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from . import StableDiffusionXLPipelineOutput diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py index 1d86dff702ef..c47b53b53bef 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py @@ -38,9 +38,9 @@ is_accelerate_version, is_invisible_watermark_available, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from . import StableDiffusionXLPipelineOutput diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py index fe9fc1a53d32..c283f5bade68 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py @@ -36,9 +36,9 @@ is_accelerate_version, is_invisible_watermark_available, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from . 
import StableDiffusionXLPipelineOutput diff --git a/src/diffusers/pipelines/stochastic_karras_ve/__init__.py b/src/diffusers/pipelines/stochastic_karras_ve/__init__.py index 5a63c1d24afb..2f82b438c5e3 100644 --- a/src/diffusers/pipelines/stochastic_karras_ve/__init__.py +++ b/src/diffusers/pipelines/stochastic_karras_ve/__init__.py @@ -1 +1,15 @@ -from .pipeline_stochastic_karras_ve import KarrasVePipeline +from ...utils import _LazyModule + + +_import_structure = {} +_import_structure["pipeline_stochastic_karras_ve"] = ["KarrasVePipeline"] + +import sys + + +sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, +) diff --git a/src/diffusers/pipelines/stochastic_karras_ve/pipeline_stochastic_karras_ve.py b/src/diffusers/pipelines/stochastic_karras_ve/pipeline_stochastic_karras_ve.py index 61b5ed2d160f..d850f5a73351 100644 --- a/src/diffusers/pipelines/stochastic_karras_ve/pipeline_stochastic_karras_ve.py +++ b/src/diffusers/pipelines/stochastic_karras_ve/pipeline_stochastic_karras_ve.py @@ -18,7 +18,7 @@ from ...models import UNet2DModel from ...schedulers import KarrasVeScheduler -from ...utils import randn_tensor +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput diff --git a/src/diffusers/pipelines/t2i_adapter/__init__.py b/src/diffusers/pipelines/t2i_adapter/__init__.py index a9a81df36a1a..b6e6ee724a67 100644 --- a/src/diffusers/pipelines/t2i_adapter/__init__.py +++ b/src/diffusers/pipelines/t2i_adapter/__init__.py @@ -1,15 +1,34 @@ from ...utils import ( OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, is_torch_available, is_transformers_available, ) +_import_structure = {} +_dummy_objects = {} + try: if not (is_transformers_available() and is_torch_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) else: - from .pipeline_stable_diffusion_adapter import StableDiffusionAdapterPipeline - from .pipeline_stable_diffusion_xl_adapter import StableDiffusionXLAdapterPipeline + _import_structure["pipeline_stable_diffusion_adapter"] = ["StableDiffusionAdapterPipeline"] + _import_structure["pipeline_stable_diffusion_xl_adapter"] = ["StableDiffusionXLAdapterPipeline"] + +import sys + + +sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, +) +for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py index 93b5f3b25d8b..8884c94eb72e 100644 --- a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py @@ -33,9 +33,9 @@ is_accelerate_available, is_accelerate_version, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker diff --git a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py 
b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py index 9809e1dddfee..5116dd8f7b52 100644 --- a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py @@ -38,9 +38,9 @@ is_accelerate_available, is_accelerate_version, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline diff --git a/src/diffusers/pipelines/text_to_video_synthesis/__init__.py b/src/diffusers/pipelines/text_to_video_synthesis/__init__.py index 97683885aac9..af3b9bfde1ce 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/__init__.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/__init__.py @@ -1,32 +1,35 @@ -from dataclasses import dataclass -from typing import List, Optional, Union +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) -import numpy as np -import torch - -from ...utils import BaseOutput, OptionalDependencyNotAvailable, is_torch_available, is_transformers_available - - -@dataclass -class TextToVideoSDPipelineOutput(BaseOutput): - """ - Output class for text-to-video pipelines. - - Args: - frames (`List[np.ndarray]` or `torch.FloatTensor`) - List of denoised frames (essentially images) as NumPy arrays of shape `(height, width, num_channels)` or as - a `torch` tensor. The length of the list denotes the video length (the number of frames). - """ - - frames: Union[List[np.ndarray], torch.FloatTensor] +_import_structure = {} +_dummy_objects = {} try: if not (is_transformers_available() and is_torch_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) + else: - from .pipeline_text_to_video_synth import TextToVideoSDPipeline - from .pipeline_text_to_video_synth_img2img import VideoToVideoSDPipeline # noqa: F401 - from .pipeline_text_to_video_zero import TextToVideoZeroPipeline + _import_structure["pipeline_output"] = ["TextToVideoSDPipelineOutput"] + _import_structure["pipeline_text_to_video_synth"] = ["TextToVideoSDPipeline"] + _import_structure["pipeline_text_to_video_synth_img2img"] = ["VideoToVideoSDPipeline"] + _import_structure["pipeline_text_to_video_zero"] = ["TextToVideoZeroPipeline"] + +import sys + + +sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, +) diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_output.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_output.py new file mode 100644 index 000000000000..411515809e6f --- /dev/null +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_output.py @@ -0,0 +1,23 @@ +from dataclasses import dataclass +from typing import List, Union + +import numpy as np +import torch + +from ...utils import ( + BaseOutput, +) + + +@dataclass +class TextToVideoSDPipelineOutput(BaseOutput): + """ + Output class for text-to-video pipelines. + + Args: + frames (`List[np.ndarray]` or `torch.FloatTensor`) + List of denoised frames (essentially images) as NumPy arrays of shape `(height, width, num_channels)` or as + a `torch` tensor. 
The length of the list denotes the video length (the number of frames). + """ + + frames: Union[List[np.ndarray], torch.FloatTensor] diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py index 72063769c868..678c2fbff438 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py @@ -28,9 +28,9 @@ is_accelerate_available, is_accelerate_version, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from . import TextToVideoSDPipelineOutput diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py index cb0c24c474a4..b7a4bfdd8859 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py @@ -29,9 +29,9 @@ is_accelerate_available, is_accelerate_version, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from . import TextToVideoSDPipelineOutput diff --git a/src/diffusers/pipelines/unclip/__init__.py b/src/diffusers/pipelines/unclip/__init__.py index 075e66bb680a..f546dbb5041d 100644 --- a/src/diffusers/pipelines/unclip/__init__.py +++ b/src/diffusers/pipelines/unclip/__init__.py @@ -1,17 +1,38 @@ from ...utils import ( OptionalDependencyNotAvailable, + _LazyModule, is_torch_available, is_transformers_available, is_transformers_version, ) +_import_structure = {} +_dummy_objects = {} + + try: if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0")): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from ...utils.dummy_torch_and_transformers_objects import UnCLIPImageVariationPipeline, UnCLIPPipeline + + _dummy_objects.update( + {"UnCLIPImageVariationPipeline": UnCLIPImageVariationPipeline, "UnCLIPPipeline": UnCLIPPipeline} + ) else: - from .pipeline_unclip import UnCLIPPipeline - from .pipeline_unclip_image_variation import UnCLIPImageVariationPipeline - from .text_proj import UnCLIPTextProjModel + _import_structure["pipeline_unclip"] = ["UnCLIPPipeline"] + _import_structure["pipeline_unclip_image_variation"] = ["UnCLIPImageVariationPipeline"] + _import_structure["text_proj"] = ["UnCLIPTextProjModel"] + +import sys + + +sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, +) +for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/src/diffusers/pipelines/unclip/pipeline_unclip.py b/src/diffusers/pipelines/unclip/pipeline_unclip.py index 92d42bf0c75e..7e8dc22f6ca2 100644 --- a/src/diffusers/pipelines/unclip/pipeline_unclip.py +++ b/src/diffusers/pipelines/unclip/pipeline_unclip.py @@ -22,7 +22,8 @@ from ...models import PriorTransformer, UNet2DConditionModel, UNet2DModel from ...schedulers import UnCLIPScheduler -from ...utils import logging, randn_tensor +from ...utils import logging +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput from .text_proj import 
UnCLIPTextProjModel diff --git a/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py b/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py index f22ede9dede9..8ec917f9e297 100644 --- a/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +++ b/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py @@ -27,7 +27,8 @@ from ...models import UNet2DConditionModel, UNet2DModel from ...schedulers import UnCLIPScheduler -from ...utils import logging, randn_tensor +from ...utils import logging +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput from .text_proj import UnCLIPTextProjModel diff --git a/src/diffusers/pipelines/unidiffuser/__init__.py b/src/diffusers/pipelines/unidiffuser/__init__.py index a774e3274030..ac0207b6045d 100644 --- a/src/diffusers/pipelines/unidiffuser/__init__.py +++ b/src/diffusers/pipelines/unidiffuser/__init__.py @@ -1,11 +1,15 @@ from ...utils import ( OptionalDependencyNotAvailable, + _LazyModule, is_torch_available, is_transformers_available, - is_transformers_version, ) +_import_structure = {} +_dummy_objects = {} + + try: if not (is_transformers_available() and is_torch_available()): raise OptionalDependencyNotAvailable() @@ -14,7 +18,25 @@ ImageTextPipelineOutput, UniDiffuserPipeline, ) + + _dummy_objects.update( + {"ImageTextPipelineOutput": ImageTextPipelineOutput, "UniDiffuserPipeline": UniDiffuserPipeline} + ) + else: - from .modeling_text_decoder import UniDiffuserTextDecoder - from .modeling_uvit import UniDiffuserModel, UTransformer2DModel - from .pipeline_unidiffuser import ImageTextPipelineOutput, UniDiffuserPipeline + _import_structure["modeling_text_decoder"] = ["UniDiffuserTextDecoder"] + _import_structure["modeling_uvit"] = ["UniDiffuserModel", "UTransformer2DModel"] + _import_structure["pipeline_unidiffuser"] = ["ImageTextPipelineOutput", "UniDiffuserPipeline"] + +import sys + + +sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, +) + +for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py b/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py index 670c915c6de1..2fcb89734089 100644 --- a/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +++ b/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py @@ -15,15 +15,9 @@ from ...models import AutoencoderKL from ...schedulers import KarrasDiffusionSchedulers -from ...utils import ( - PIL_INTERPOLATION, - deprecate, - is_accelerate_available, - is_accelerate_version, - logging, - randn_tensor, -) +from ...utils import PIL_INTERPOLATION, deprecate, is_accelerate_available, is_accelerate_version, logging from ...utils.outputs import BaseOutput +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from .modeling_text_decoder import UniDiffuserTextDecoder from .modeling_uvit import UniDiffuserModel diff --git a/src/diffusers/pipelines/versatile_diffusion/__init__.py b/src/diffusers/pipelines/versatile_diffusion/__init__.py index abf9dcff59db..8fbe932b18a6 100644 --- a/src/diffusers/pipelines/versatile_diffusion/__init__.py +++ b/src/diffusers/pipelines/versatile_diffusion/__init__.py @@ -1,11 +1,16 @@ from ...utils import ( OptionalDependencyNotAvailable, + _LazyModule, is_torch_available, is_transformers_available, is_transformers_version, ) +_import_structure = {} 
+_dummy_objects = {} + + try: if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0")): raise OptionalDependencyNotAvailable() @@ -16,9 +21,31 @@ VersatileDiffusionPipeline, VersatileDiffusionTextToImagePipeline, ) + + _dummy_objects.update( + { + "VersatileDiffusionDualGuidedPipeline": VersatileDiffusionDualGuidedPipeline, + "VersatileDiffusionImageVariationPipeline": VersatileDiffusionImageVariationPipeline, + "VersatileDiffusionPipeline": VersatileDiffusionPipeline, + "VersatileDiffusionTextToImagePipeline": VersatileDiffusionTextToImagePipeline, + } + ) else: - from .modeling_text_unet import UNetFlatConditionModel - from .pipeline_versatile_diffusion import VersatileDiffusionPipeline - from .pipeline_versatile_diffusion_dual_guided import VersatileDiffusionDualGuidedPipeline - from .pipeline_versatile_diffusion_image_variation import VersatileDiffusionImageVariationPipeline - from .pipeline_versatile_diffusion_text_to_image import VersatileDiffusionTextToImagePipeline + _import_structure["modeling_text_unet"] = ["UNetFlatConditionModel"] + _import_structure["pipeline_versatile_diffusion"] = ["VersatileDiffusionPipeline"] + _import_structure["pipeline_versatile_diffusion_dual_guided"] = ["VersatileDiffusionDualGuidedPipeline"] + _import_structure["pipeline_versatile_diffusion_image_variation"] = ["VersatileDiffusionImageVariationPipeline"] + _import_structure["pipeline_versatile_diffusion_text_to_image"] = ["VersatileDiffusionTextToImagePipeline"] + +import sys + + +sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, +) + +for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py index 9bd724429e5d..cbb91e8a9e9a 100644 --- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py @@ -29,7 +29,8 @@ from ...image_processor import VaeImageProcessor from ...models import AutoencoderKL, DualTransformer2DModel, Transformer2DModel, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers -from ...utils import deprecate, logging, randn_tensor +from ...utils import deprecate, logging +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput from .modeling_text_unet import UNetFlatConditionModel diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py index da6f9bf23589..f06aa4b45d4d 100644 --- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py @@ -24,7 +24,8 @@ from ...image_processor import VaeImageProcessor from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers -from ...utils import deprecate, logging, randn_tensor +from ...utils import deprecate, logging +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput diff --git 
a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py index a443bc9d2225..f2d3aebce2b6 100644 --- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py @@ -22,7 +22,8 @@ from ...image_processor import VaeImageProcessor from ...models import AutoencoderKL, Transformer2DModel, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers -from ...utils import deprecate, logging, randn_tensor +from ...utils import deprecate, logging +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput from .modeling_text_unet import UNetFlatConditionModel diff --git a/src/diffusers/pipelines/vq_diffusion/__init__.py b/src/diffusers/pipelines/vq_diffusion/__init__.py index da60bf73ad42..8917802c2694 100644 --- a/src/diffusers/pipelines/vq_diffusion/__init__.py +++ b/src/diffusers/pipelines/vq_diffusion/__init__.py @@ -1,10 +1,39 @@ -from ...utils import OptionalDependencyNotAvailable, is_torch_available, is_transformers_available +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_torch_available, + is_transformers_available, +) + + +_import_structure = {} +_dummy_objects = {} try: if not (is_transformers_available() and is_torch_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ...utils.dummy_torch_and_transformers_objects import * + from ...utils.dummy_torch_and_transformers_objects import ( + LearnedClassifierFreeSamplingEmbeddings, + VQDiffusionPipeline, + ) + + _dummy_objects.update( + { + "LearnedClassifierFreeSamplingEmbeddings": LearnedClassifierFreeSamplingEmbeddings, + "VQDiffusionPipeline": VQDiffusionPipeline, + } + ) else: - from .pipeline_vq_diffusion import LearnedClassifierFreeSamplingEmbeddings, VQDiffusionPipeline + _import_structure["pipeline_vq_diffusion"] = ["LearnedClassifierFreeSamplingEmbeddings", "VQDiffusionPipeline"] + +import sys + + +sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, +) diff --git a/src/diffusers/pipelines/wuerstchen/__init__.py b/src/diffusers/pipelines/wuerstchen/__init__.py index a6f6321b048a..f77b597a0b92 100644 --- a/src/diffusers/pipelines/wuerstchen/__init__.py +++ b/src/diffusers/pipelines/wuerstchen/__init__.py @@ -1,10 +1,38 @@ -from ...utils import is_torch_available, is_transformers_available +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) -if is_transformers_available() and is_torch_available(): - from .modeling_paella_vq_model import PaellaVQModel - from .modeling_wuerstchen_diffnext import WuerstchenDiffNeXt - from .modeling_wuerstchen_prior import WuerstchenPrior - from .pipeline_wuerstchen import WuerstchenDecoderPipeline - from .pipeline_wuerstchen_combined import WuerstchenCombinedPipeline - from .pipeline_wuerstchen_prior import WuerstchenPriorPipeline +_import_structure = {} +_dummy_objects = {} +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects + + 
_dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) + +else: + _import_structure["modeling_paella_vq_model"] = ["PaellaVQModel"] + _import_structure["modeling_wuerstchen_diffnext"] = ["WuerstchenDiffNeXt"] + _import_structure["modeling_wuerstchen_prior"] = ["WuerstchenPrior"] + _import_structure["pipeline_wuerstchen"] = ["WuerstchenDecoderPipeline"] + _import_structure["pipeline_wuerstchen_combined"] = ["WuerstchenCombinedPipeline"] + _import_structure["pipeline_wuerstchen_prior"] = ["WuerstchenPriorPipeline"] + + +import sys + + +sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, +) diff --git a/src/diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py b/src/diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py index 09bdd16592df..7ee42faa0e82 100644 --- a/src/diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +++ b/src/diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py @@ -22,7 +22,7 @@ from ...models.modeling_utils import ModelMixin from ...models.vae import DecoderOutput, VectorQuantizer from ...models.vq_model import VQEncoderOutput -from ...utils import apply_forward_hook +from ...utils.accelerate_utils import apply_forward_hook class MixingResidualBlock(nn.Module): diff --git a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py index 78aeebed7943..7f6b0546da7b 100644 --- a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +++ b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py @@ -19,7 +19,8 @@ from transformers import CLIPTextModel, CLIPTokenizer from ...schedulers import DDPMWuerstchenScheduler -from ...utils import is_accelerate_available, is_accelerate_version, logging, randn_tensor, replace_example_docstring +from ...utils import is_accelerate_available, is_accelerate_version, logging, replace_example_docstring +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput from .modeling_paella_vq_model import PaellaVQModel from .modeling_wuerstchen_diffnext import WuerstchenDiffNeXt diff --git a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py index 8b13d8fdf2b7..297462bd96f7 100644 --- a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +++ b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py @@ -26,9 +26,9 @@ is_accelerate_available, is_accelerate_version, logging, - randn_tensor, replace_example_docstring, ) +from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from .modeling_wuerstchen_prior import WuerstchenPrior diff --git a/src/diffusers/schedulers/__init__.py b/src/diffusers/schedulers/__init__.py index 84df4ffb84db..270e10cdbe18 100644 --- a/src/diffusers/schedulers/__init__.py +++ b/src/diffusers/schedulers/__init__.py @@ -15,6 +15,7 @@ from ..utils import ( OptionalDependencyNotAvailable, + _LazyModule, is_flax_available, is_scipy_available, is_torch_available, @@ -22,38 +23,49 @@ ) +_import_structure = {} +_dummy_modules = {} + try: if not is_torch_available(): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ..utils.dummy_pt_objects import * # noqa F403 + from ..utils import dummy_pt_objects # noqa F403 + + modules = {} + for name in dir(dummy_pt_objects): + if (not name.endswith("Scheduler")) or name.startswith("_"): + 
continue + modules[name] = getattr(dummy_pt_objects, name) + _dummy_modules.update(modules) + else: - from .scheduling_consistency_models import CMStochasticIterativeScheduler - from .scheduling_ddim import DDIMScheduler - from .scheduling_ddim_inverse import DDIMInverseScheduler - from .scheduling_ddim_parallel import DDIMParallelScheduler - from .scheduling_ddpm import DDPMScheduler - from .scheduling_ddpm_parallel import DDPMParallelScheduler - from .scheduling_ddpm_wuerstchen import DDPMWuerstchenScheduler - from .scheduling_deis_multistep import DEISMultistepScheduler - from .scheduling_dpmsolver_multistep import DPMSolverMultistepScheduler - from .scheduling_dpmsolver_multistep_inverse import DPMSolverMultistepInverseScheduler - from .scheduling_dpmsolver_singlestep import DPMSolverSinglestepScheduler - from .scheduling_euler_ancestral_discrete import EulerAncestralDiscreteScheduler - from .scheduling_euler_discrete import EulerDiscreteScheduler - from .scheduling_heun_discrete import HeunDiscreteScheduler - from .scheduling_ipndm import IPNDMScheduler - from .scheduling_k_dpm_2_ancestral_discrete import KDPM2AncestralDiscreteScheduler - from .scheduling_k_dpm_2_discrete import KDPM2DiscreteScheduler - from .scheduling_karras_ve import KarrasVeScheduler - from .scheduling_pndm import PNDMScheduler - from .scheduling_repaint import RePaintScheduler - from .scheduling_sde_ve import ScoreSdeVeScheduler - from .scheduling_sde_vp import ScoreSdeVpScheduler - from .scheduling_unclip import UnCLIPScheduler - from .scheduling_unipc_multistep import UniPCMultistepScheduler - from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin - from .scheduling_vq_diffusion import VQDiffusionScheduler + _import_structure["scheduling_consistency_models"] = ["CMStochasticIterativeScheduler"] + _import_structure["scheduling_ddim"] = ["DDIMScheduler"] + _import_structure["scheduling_ddim_inverse"] = ["DDIMInverseScheduler"] + _import_structure["scheduling_ddim_parallel"] = ["DDIMParallelScheduler"] + _import_structure["scheduling_ddpm"] = ["DDPMScheduler"] + _import_structure["scheduling_ddpm_parallel"] = ["DDPMParallelScheduler"] + _import_structure["scheduling_deis_multistep"] = ["DEISMultistepScheduler"] + _import_structure["scheduling_dpmsolver_multistep"] = ["DPMSolverMultistepScheduler"] + _import_structure["scheduling_dpmsolver_multistep_inverse"] = ["DPMSolverMultistepInverseScheduler"] + _import_structure["scheduling_dpmsolver_singlestep"] = ["DPMSolverSinglestepScheduler"] + _import_structure["scheduling_euler_ancestral_discrete"] = ["EulerAncestralDiscreteScheduler"] + _import_structure["scheduling_euler_discrete"] = ["EulerDiscreteScheduler"] + _import_structure["scheduling_heun_discrete"] = ["HeunDiscreteScheduler"] + _import_structure["scheduling_ipndm"] = ["IPNDMScheduler"] + _import_structure["scheduling_k_dpm_2_ancestral_discrete"] = ["KDPM2AncestralDiscreteScheduler"] + _import_structure["scheduling_k_dpm_2_discrete"] = ["KDPM2DiscreteScheduler"] + _import_structure["scheduling_karras_ve"] = ["KarrasVeScheduler"] + _import_structure["scheduling_pndm"] = ["PNDMScheduler"] + _import_structure["scheduling_repaint"] = ["RePaintScheduler"] + _import_structure["scheduling_sde_ve"] = ["ScoreSdeVeScheduler"] + _import_structure["scheduling_sde_vp"] = ["ScoreSdeVpScheduler"] + _import_structure["scheduling_unclip"] = ["UnCLIPScheduler"] + _import_structure["scheduling_unipc_multistep"] = ["UniPCMultistepScheduler"] + _import_structure["scheduling_utils"] = 
["KarrasDiffusionSchedulers", "SchedulerMixin"] + _import_structure["scheduling_vq_diffusion"] = ["VQDiffusionScheduler"] + _import_structure["scheduling_ddpm_wuerstchen"] = ["DDPMWuerstchenScheduler"] try: if not is_flax_available(): @@ -61,33 +73,59 @@ except OptionalDependencyNotAvailable: from ..utils.dummy_flax_objects import * # noqa F403 else: - from .scheduling_ddim_flax import FlaxDDIMScheduler - from .scheduling_ddpm_flax import FlaxDDPMScheduler - from .scheduling_dpmsolver_multistep_flax import FlaxDPMSolverMultistepScheduler - from .scheduling_karras_ve_flax import FlaxKarrasVeScheduler - from .scheduling_lms_discrete_flax import FlaxLMSDiscreteScheduler - from .scheduling_pndm_flax import FlaxPNDMScheduler - from .scheduling_sde_ve_flax import FlaxScoreSdeVeScheduler - from .scheduling_utils_flax import ( - FlaxKarrasDiffusionSchedulers, - FlaxSchedulerMixin, - FlaxSchedulerOutput, - broadcast_to_shape_from_left, - ) + _import_structure["scheduling_ddim_flax"] = ["FlaxDDIMScheduler"] + _import_structure["scheduling_ddpm_flax"] = ["FlaxDDPMScheduler"] + _import_structure["scheduling_dpmsolver_multistep_flax"] = ["FlaxDPMSolverMultistepScheduler"] + _import_structure["scheduling_karras_ve_flax"] = ["FlaxKarrasVeScheduler"] + _import_structure["scheduling_lms_discrete_flax"] = ["FlaxLMSDiscreteScheduler"] + _import_structure["scheduling_pndm_flax"] = ["FlaxPNDMScheduler"] + _import_structure["scheduling_sde_ve_flax"] = ["FlaxScoreSdeVeScheduler"] + _import_structure["scheduling_utils_flax"] = [ + "FlaxKarrasDiffusionSchedulers", + "FlaxSchedulerMixin", + "FlaxSchedulerOutput", + "broadcast_to_shape_from_left", + ] try: if not (is_torch_available() and is_scipy_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ..utils.dummy_torch_and_scipy_objects import * # noqa F403 + from ..utils import dummy_torch_and_scipy_objects # noqa F403 + + modules = {} + for name in dir(dummy_torch_and_scipy_objects): + if (not name.endswith("Scheduler")) or name.startswith("_"): + continue + modules[name] = getattr(dummy_torch_and_scipy_objects, name) + + _dummy_modules.update(modules) + else: - from .scheduling_lms_discrete import LMSDiscreteScheduler + _import_structure["scheduling_lms_discrete"] = ["LMSDiscreteScheduler"] try: if not (is_torch_available() and is_torchsde_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ..utils.dummy_torch_and_torchsde_objects import * # noqa F403 + from ..utils import dummy_torch_and_torchsde_objects # noqa F403 + + modules = {} + for name in dir(dummy_torch_and_torchsde_objects): + if (not name.endswith("Scheduler")) or name.startswith("_"): + continue + modules[name] = getattr(dummy_torch_and_torchsde_objects, name) + + _dummy_modules.update(modules) + + else: - from .scheduling_dpmsolver_sde import DPMSolverSDEScheduler + _import_structure["scheduling_dpmsolver_sde"] = ["DPMSolverSDEScheduler"] + +import sys + + +sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) +for name, value in _dummy_modules.items(): + setattr(sys.modules[__name__], name, value) diff --git a/src/diffusers/schedulers/scheduling_consistency_models.py b/src/diffusers/schedulers/scheduling_consistency_models.py index 735c6fc6cdd7..23cd3ec134b7 100644 --- a/src/diffusers/schedulers/scheduling_consistency_models.py +++ b/src/diffusers/schedulers/scheduling_consistency_models.py @@ -19,7 +19,8 @@ import torch from ..configuration_utils 
import ConfigMixin, register_to_config -from ..utils import BaseOutput, logging, randn_tensor +from ..utils import BaseOutput, logging +from ..utils.torch_utils import randn_tensor from .scheduling_utils import SchedulerMixin diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py index 512e449edea3..aab5255abced 100644 --- a/src/diffusers/schedulers/scheduling_ddim.py +++ b/src/diffusers/schedulers/scheduling_ddim.py @@ -23,7 +23,8 @@ import torch from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput, randn_tensor +from ..utils import BaseOutput +from ..utils.torch_utils import randn_tensor from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin diff --git a/src/diffusers/schedulers/scheduling_ddim_parallel.py b/src/diffusers/schedulers/scheduling_ddim_parallel.py index 0f1a9ebfcc43..f90a271dfc06 100644 --- a/src/diffusers/schedulers/scheduling_ddim_parallel.py +++ b/src/diffusers/schedulers/scheduling_ddim_parallel.py @@ -23,7 +23,8 @@ import torch from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput, randn_tensor +from ..utils import BaseOutput +from ..utils.torch_utils import randn_tensor from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py index db4ede39e2e3..86f7e84ff07f 100644 --- a/src/diffusers/schedulers/scheduling_ddpm.py +++ b/src/diffusers/schedulers/scheduling_ddpm.py @@ -22,7 +22,8 @@ import torch from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput, randn_tensor +from ..utils import BaseOutput +from ..utils.torch_utils import randn_tensor from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin diff --git a/src/diffusers/schedulers/scheduling_ddpm_parallel.py b/src/diffusers/schedulers/scheduling_ddpm_parallel.py index 7e04001987f2..2f3bdd39aaa4 100644 --- a/src/diffusers/schedulers/scheduling_ddpm_parallel.py +++ b/src/diffusers/schedulers/scheduling_ddpm_parallel.py @@ -22,7 +22,8 @@ import torch from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput, randn_tensor +from ..utils import BaseOutput +from ..utils.torch_utils import randn_tensor from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin diff --git a/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py b/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py index 28311fc03301..781efb12b18b 100644 --- a/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py +++ b/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py @@ -22,7 +22,8 @@ import torch from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput, randn_tensor +from ..utils import BaseOutput +from ..utils.torch_utils import randn_tensor from .scheduling_utils import SchedulerMixin diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py index 8c25cdff8a07..babba2206de0 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py @@ -21,7 +21,7 @@ import torch from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import randn_tensor +from ..utils.torch_utils import randn_tensor from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, 
SchedulerOutput diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py index 34639d38a6a2..33a2637d00f3 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py @@ -21,7 +21,7 @@ import torch from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import randn_tensor +from ..utils.torch_utils import randn_tensor from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput diff --git a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py index a776be758189..41ef3a3f2732 100644 --- a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py @@ -20,7 +20,8 @@ import torch from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput, logging, randn_tensor +from ..utils import BaseOutput, logging +from ..utils.torch_utils import randn_tensor from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py index 2cc36a1718d0..0875e1af3325 100644 --- a/src/diffusers/schedulers/scheduling_euler_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_discrete.py @@ -20,7 +20,8 @@ import torch from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput, logging, randn_tensor +from ..utils import BaseOutput, logging +from ..utils.torch_utils import randn_tensor from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin diff --git a/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py index 0b2569a94f6c..b44ff31379ad 100644 --- a/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +++ b/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py @@ -20,7 +20,7 @@ import torch from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import randn_tensor +from ..utils.torch_utils import randn_tensor from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput diff --git a/src/diffusers/schedulers/scheduling_karras_ve.py b/src/diffusers/schedulers/scheduling_karras_ve.py index 1f8613cfe44a..462169b633de 100644 --- a/src/diffusers/schedulers/scheduling_karras_ve.py +++ b/src/diffusers/schedulers/scheduling_karras_ve.py @@ -20,7 +20,8 @@ import torch from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput, randn_tensor +from ..utils import BaseOutput +from ..utils.torch_utils import randn_tensor from .scheduling_utils import SchedulerMixin diff --git a/src/diffusers/schedulers/scheduling_repaint.py b/src/diffusers/schedulers/scheduling_repaint.py index 941946efe914..733bd0a159fd 100644 --- a/src/diffusers/schedulers/scheduling_repaint.py +++ b/src/diffusers/schedulers/scheduling_repaint.py @@ -20,7 +20,8 @@ import torch from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput, randn_tensor +from ..utils import BaseOutput +from ..utils.torch_utils import randn_tensor from .scheduling_utils import SchedulerMixin diff --git a/src/diffusers/schedulers/scheduling_sde_ve.py 
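The scheduler hunks above and below all repeat one mechanical change: `randn_tensor` moves from the `diffusers.utils` re-export to its canonical home in `diffusers.utils.torch_utils`. For downstream code the migration is an import swap; a sketch, with the tensor shape chosen arbitrarily for illustration:

    # Before: from diffusers.utils import randn_tensor
    # After this patch series:
    import torch
    from diffusers.utils.torch_utils import randn_tensor

    # randn_tensor behaves like torch.randn but also accepts a list of
    # per-sample generators and an explicit device, which the schedulers rely on.
    noise = randn_tensor((1, 4, 64, 64), generator=torch.Generator().manual_seed(0))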
b/src/diffusers/schedulers/scheduling_sde_ve.py index f1026de8f276..8b9439add3ec 100644 --- a/src/diffusers/schedulers/scheduling_sde_ve.py +++ b/src/diffusers/schedulers/scheduling_sde_ve.py @@ -21,7 +21,8 @@ import torch from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput, randn_tensor +from ..utils import BaseOutput +from ..utils.torch_utils import randn_tensor from .scheduling_utils import SchedulerMixin, SchedulerOutput diff --git a/src/diffusers/schedulers/scheduling_sde_vp.py b/src/diffusers/schedulers/scheduling_sde_vp.py index ff719adbbd28..b14bc867befa 100644 --- a/src/diffusers/schedulers/scheduling_sde_vp.py +++ b/src/diffusers/schedulers/scheduling_sde_vp.py @@ -20,7 +20,7 @@ import torch from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import randn_tensor +from ..utils.torch_utils import randn_tensor from .scheduling_utils import SchedulerMixin diff --git a/src/diffusers/schedulers/scheduling_unclip.py b/src/diffusers/schedulers/scheduling_unclip.py index 844e552c0fb4..2f5b17815dd6 100644 --- a/src/diffusers/schedulers/scheduling_unclip.py +++ b/src/diffusers/schedulers/scheduling_unclip.py @@ -20,7 +20,8 @@ import torch from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput, randn_tensor +from ..utils import BaseOutput +from ..utils.torch_utils import randn_tensor from .scheduling_utils import SchedulerMixin diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py index 9b710d214d92..a846f6caef08 100644 --- a/src/diffusers/utils/__init__.py +++ b/src/diffusers/utils/__init__.py @@ -18,7 +18,6 @@ from packaging import version from .. import __version__ -from .accelerate_utils import apply_forward_hook from .constants import ( CONFIG_NAME, DEPRECATED_REVISION_ARGS, @@ -35,6 +34,7 @@ from .deprecation_utils import deprecate from .doc_utils import replace_example_docstring from .dynamic_modules_utils import get_class_from_dynamic_module +from .export_utils import export_to_gif, export_to_obj, export_to_ply, export_to_video from .hub_utils import ( HF_HUB_OFFLINE, PushToHubMixin, @@ -52,6 +52,8 @@ USE_TORCH, DummyObject, OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, is_accelerate_available, is_accelerate_version, is_bs4_available, @@ -78,32 +80,10 @@ is_xformers_available, requires_backends, ) +from .loading_utils import load_image from .logging import get_logger from .outputs import BaseOutput from .pil_utils import PIL_INTERPOLATION, make_image_grid, numpy_to_pil, pt_to_pil -from .torch_utils import is_compiled_module, randn_tensor - - -if is_torch_available(): - from .testing_utils import ( - floats_tensor, - load_hf_numpy, - load_image, - load_numpy, - load_pt, - nightly, - parse_flag_from_env, - print_tensor_test, - require_torch_2, - require_torch_gpu, - skip_mps, - slow, - torch_all_close, - torch_device, - ) - from .torch_utils import maybe_allow_in_graph - -from .testing_utils import export_to_gif, export_to_obj, export_to_ply, export_to_video logger = get_logger(__name__) diff --git a/src/diffusers/utils/export_utils.py b/src/diffusers/utils/export_utils.py new file mode 100644 index 000000000000..f7744f9d63eb --- /dev/null +++ b/src/diffusers/utils/export_utils.py @@ -0,0 +1,132 @@ +import io +import random +import struct +import tempfile +from contextlib import contextmanager +from typing import List + +import numpy as np +import PIL.Image +import PIL.ImageOps + +from .import_utils import ( + 
+    BACKENDS_MAPPING,
+    is_opencv_available,
+)
+from .logging import get_logger
+
+
+global_rng = random.Random()
+
+logger = get_logger(__name__)
+
+
+@contextmanager
+def buffered_writer(raw_f):
+    f = io.BufferedWriter(raw_f)
+    yield f
+    f.flush()
+
+
+def export_to_gif(image: List[PIL.Image.Image], output_gif_path: str = None) -> str:
+    if output_gif_path is None:
+        output_gif_path = tempfile.NamedTemporaryFile(suffix=".gif").name
+
+    image[0].save(
+        output_gif_path,
+        save_all=True,
+        append_images=image[1:],
+        optimize=False,
+        duration=100,
+        loop=0,
+    )
+    return output_gif_path
+
+
+def export_to_ply(mesh, output_ply_path: str = None):
+    """
+    Write a PLY file for a mesh.
+    """
+    if output_ply_path is None:
+        output_ply_path = tempfile.NamedTemporaryFile(suffix=".ply").name
+
+    coords = mesh.verts.detach().cpu().numpy()
+    faces = mesh.faces.cpu().numpy()
+    rgb = np.stack([mesh.vertex_channels[x].detach().cpu().numpy() for x in "RGB"], axis=1)
+
+    with buffered_writer(open(output_ply_path, "wb")) as f:
+        f.write(b"ply\n")
+        f.write(b"format binary_little_endian 1.0\n")
+        f.write(bytes(f"element vertex {len(coords)}\n", "ascii"))
+        f.write(b"property float x\n")
+        f.write(b"property float y\n")
+        f.write(b"property float z\n")
+        if rgb is not None:
+            f.write(b"property uchar red\n")
+            f.write(b"property uchar green\n")
+            f.write(b"property uchar blue\n")
+        if faces is not None:
+            f.write(bytes(f"element face {len(faces)}\n", "ascii"))
+            f.write(b"property list uchar int vertex_index\n")
+        f.write(b"end_header\n")
+
+        if rgb is not None:
+            rgb = (rgb * 255.499).round().astype(int)
+            vertices = [
+                (*coord, *rgb)
+                for coord, rgb in zip(
+                    coords.tolist(),
+                    rgb.tolist(),
+                )
+            ]
+            format = struct.Struct("<3f3B")
+            for item in vertices:
+                f.write(format.pack(*item))
+        else:
+            format = struct.Struct("<3f")
+            for vertex in coords.tolist():
+                f.write(format.pack(*vertex))
+
+        if faces is not None:
+            format = struct.Struct("<B3I")
+            for tri in faces.tolist():
+                f.write(format.pack(len(tri), *tri))
+
+    return output_ply_path
+
+
+def export_to_obj(mesh, output_obj_path: str = None):
+    if output_obj_path is None:
+        output_obj_path = tempfile.NamedTemporaryFile(suffix=".obj").name
+
+    verts = mesh.verts.detach().cpu().numpy()
+    faces = mesh.faces.cpu().numpy()
+
+    vertex_colors = np.stack([mesh.vertex_channels[x].detach().cpu().numpy() for x in "RGB"], axis=1)
+    vertices = [
+        "{} {} {} {} {} {}".format(*coord, *color) for coord, color in zip(verts.tolist(), vertex_colors.tolist())
+    ]
+
+    faces = ["f {} {} {}".format(str(tri[0] + 1), str(tri[1] + 1), str(tri[2] + 1)) for tri in faces]
+
+    combined_data = ["v " + vertex for vertex in vertices] + faces
+
+    with open(output_obj_path, "w") as f:
+        f.writelines("\n".join(combined_data))
+
+
+def export_to_video(video_frames: List[np.ndarray], output_video_path: str = None) -> str:
+    if is_opencv_available():
+        import cv2
+    else:
+        raise ImportError(BACKENDS_MAPPING["opencv"][1].format("export_to_video"))
+    if output_video_path is None:
+        output_video_path = tempfile.NamedTemporaryFile(suffix=".mp4").name
+
+    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+    h, w, c = video_frames[0].shape
+    video_writer = cv2.VideoWriter(output_video_path, fourcc, fps=8, frameSize=(w, h))
+    for i in range(len(video_frames)):
+        img = cv2.cvtColor(video_frames[i], cv2.COLOR_RGB2BGR)
+        video_writer.write(img)
+    return output_video_path
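The new `export_utils.py` gives the GIF/PLY/OBJ/video writers a stable home outside of `testing_utils`. A usage sketch for the video helper; the random frames are a stand-in for real pipeline output (RGB `np.uint8` arrays), not part of this diff:

    import numpy as np
    from diffusers.utils import export_to_video

    # Sixteen dummy 256x256 RGB frames; export_to_video writes them to an .mp4
    # at 8 fps via OpenCV and returns the output path.
    frames = [np.random.randint(0, 255, (256, 256, 3), dtype=np.uint8) for _ in range(16)]
    print(export_to_video(frames))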
diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py
index 7fe5eacb25b0..1cf319e2941b 100644
--- a/src/diffusers/utils/import_utils.py
+++ b/src/diffusers/utils/import_utils.py
@@ -19,7 +19,9 @@
 import os
 import sys
 from collections import OrderedDict
-from typing import Union
+from itertools import chain
+from types import ModuleType
+from typing import Any, Union
 
 from huggingface_hub.utils import is_jinja_available  # noqa: F401
 from packaging import version
@@ -219,10 +221,10 @@
 try:
     _xformers_version = importlib_metadata.version("xformers")
     if _torch_available:
-        import torch
+        _torch_version = importlib_metadata.version("torch")
+        if version.Version(_torch_version) < version.Version("1.12"):
+            raise ValueError("xformers is installed in your environment and requires PyTorch >= 1.12")
-        if version.Version(torch.__version__) < version.Version("1.12"):
-            raise ValueError("PyTorch should be >= 1.12")
     logger.debug(f"Successfully imported xformers version {_xformers_version}")
 except importlib_metadata.PackageNotFoundError:
     _xformers_available = False
@@ -647,5 +649,85 @@ def is_k_diffusion_version(operation: str, version: str):
     return compare_versions(parse(_k_diffusion_version), operation, version)
+
+def get_objects_from_module(module):
+    """
+    Returns a dict of object names and values in a module, while skipping private/internal objects.
+
+    Args:
+        module (ModuleType):
+            Module to extract the objects from.
+
+    Returns:
+        dict: Dictionary of object names and corresponding values.
+    """
+
+    objects = {}
+    for name in dir(module):
+        if name.startswith("_"):
+            continue
+        objects[name] = getattr(module, name)
+
+    return objects
+
+
 class OptionalDependencyNotAvailable(BaseException):
     """An error indicating that an optional dependency of Diffusers was not found in the environment."""
+
+
+class _LazyModule(ModuleType):
+    """
+    Module class that surfaces all objects but only performs associated imports when the objects are requested.
+    """
+
+    # Very heavily inspired by optuna.integration._IntegrationModule
+    # https://github.com/optuna/optuna/blob/master/optuna/integration/__init__.py
+    def __init__(self, name, module_file, import_structure, module_spec=None, extra_objects=None):
+        super().__init__(name)
+        self._modules = set(import_structure.keys())
+        self._class_to_module = {}
+        for key, values in import_structure.items():
+            for value in values:
+                self._class_to_module[value] = key
+        # Needed for autocompletion in an IDE
+        self.__all__ = list(import_structure.keys()) + list(chain(*import_structure.values()))
+        self.__file__ = module_file
+        self.__spec__ = module_spec
+        self.__path__ = [os.path.dirname(module_file)]
+        self._objects = {} if extra_objects is None else extra_objects
+        self._name = name
+        self._import_structure = import_structure
+
+    # Needed for autocompletion in an IDE
+    def __dir__(self):
+        result = super().__dir__()
+        # The elements of self.__all__ that are submodules may or may not be in the dir already, depending on whether
+        # they have been accessed or not. So we only add the elements of self.__all__ that are not already in the dir.
+        for attr in self.__all__:
+            if attr not in result:
+                result.append(attr)
+        return result
+
+    def __getattr__(self, name: str) -> Any:
+        if name in self._objects:
+            return self._objects[name]
+        if name in self._modules:
+            value = self._get_module(name)
+        elif name in self._class_to_module.keys():
+            module = self._get_module(self._class_to_module[name])
+            value = getattr(module, name)
+        else:
+            raise AttributeError(f"module {self.__name__} has no attribute {name}")
+
+        setattr(self, name, value)
+        return value
+
+    def _get_module(self, module_name: str):
+        try:
+            return importlib.import_module("." + module_name, self.__name__)
+        except Exception as e:
+            raise RuntimeError(
+                f"Failed to import {self.__name__}.{module_name} because of the following error (look up to see its"
+                f" traceback):\n{e}"
+            ) from e
+
+    def __reduce__(self):
+        return (self.__class__, (self._name, self.__file__, self._import_structure))
diff --git a/src/diffusers/utils/loading_utils.py b/src/diffusers/utils/loading_utils.py
new file mode 100644
index 000000000000..279aa6fe737b
--- /dev/null
+++ b/src/diffusers/utils/loading_utils.py
@@ -0,0 +1,37 @@
+import os
+from typing import Union
+
+import PIL.Image
+import PIL.ImageOps
+import requests
+
+
+def load_image(image: Union[str, PIL.Image.Image]) -> PIL.Image.Image:
+    """
+    Loads `image` to a PIL Image.
+
+    Args:
+        image (`str` or `PIL.Image.Image`):
+            The image to convert to the PIL Image format.
+
+ Returns: + `PIL.Image.Image`: + A PIL Image. + """ + if isinstance(image, str): + if image.startswith("http://") or image.startswith("https://"): + image = PIL.Image.open(requests.get(image, stream=True).raw) + elif os.path.isfile(image): + image = PIL.Image.open(image) + else: + raise ValueError( + f"Incorrect path or url, URLs must start with `http://` or `https://`, and {image} is not a valid path" + ) + elif isinstance(image, PIL.Image.Image): + image = image + else: + raise ValueError( + "Incorrect format used for image. Should be an url linking to an image, a local path, or a PIL image." + ) + image = PIL.ImageOps.exif_transpose(image) + image = image.convert("RGB") + return image diff --git a/tests/models/test_layers_utils.py b/tests/models/test_layers_utils.py index 40627cc93caa..9d45d810f653 100644 --- a/tests/models/test_layers_utils.py +++ b/tests/models/test_layers_utils.py @@ -25,7 +25,7 @@ from diffusers.models.lora import LoRACompatibleLinear from diffusers.models.resnet import Downsample2D, ResnetBlock2D, Upsample2D from diffusers.models.transformer_2d import Transformer2DModel -from diffusers.utils import torch_device +from diffusers.utils.testing_utils import torch_device class EmbeddingsTests(unittest.TestCase): diff --git a/tests/models/test_lora_layers.py b/tests/models/test_lora_layers.py index c49ea7f2d960..1d846b6cdb3f 100644 --- a/tests/models/test_lora_layers.py +++ b/tests/models/test_lora_layers.py @@ -43,8 +43,7 @@ LoRAAttnProcessor2_0, XFormersAttnProcessor, ) -from diffusers.utils import floats_tensor, torch_device -from diffusers.utils.testing_utils import require_torch_gpu, slow +from diffusers.utils.testing_utils import floats_tensor, require_torch_gpu, slow, torch_device def create_unet_lora_layers(unet: nn.Module): diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py index d071bc3ccb60..921f67410032 100644 --- a/tests/models/test_modeling_common.py +++ b/tests/models/test_modeling_common.py @@ -30,12 +30,13 @@ from diffusers.models import UNet2DConditionModel from diffusers.models.attention_processor import AttnProcessor, AttnProcessor2_0, XFormersAttnProcessor from diffusers.training_utils import EMAModel -from diffusers.utils import logging, torch_device +from diffusers.utils import logging from diffusers.utils.testing_utils import ( CaptureLogger, require_torch_2, require_torch_gpu, run_test_in_subprocess, + torch_device, ) from ..others.test_utils import TOKEN, USER, is_staging_test diff --git a/tests/models/test_models_prior.py b/tests/models/test_models_prior.py index 25b9768ee34f..4c47a44ef52a 100644 --- a/tests/models/test_models_prior.py +++ b/tests/models/test_models_prior.py @@ -21,8 +21,7 @@ from parameterized import parameterized from diffusers import PriorTransformer -from diffusers.utils import floats_tensor, slow, torch_all_close, torch_device -from diffusers.utils.testing_utils import enable_full_determinism +from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor, slow, torch_all_close, torch_device from .test_modeling_common import ModelTesterMixin diff --git a/tests/models/test_models_unet_1d.py b/tests/models/test_models_unet_1d.py index 1b58f9e616be..5803e5bfda2a 100644 --- a/tests/models/test_models_unet_1d.py +++ b/tests/models/test_models_unet_1d.py @@ -18,7 +18,7 @@ import torch from diffusers import UNet1DModel -from diffusers.utils import floats_tensor, slow, torch_device +from diffusers.utils.testing_utils import floats_tensor, slow, torch_device from 
.test_modeling_common import ModelTesterMixin, UNetTesterMixin diff --git a/tests/models/test_models_unet_2d.py b/tests/models/test_models_unet_2d.py index 5019c7eb2740..c5289a54b4bc 100644 --- a/tests/models/test_models_unet_2d.py +++ b/tests/models/test_models_unet_2d.py @@ -20,8 +20,14 @@ import torch from diffusers import UNet2DModel -from diffusers.utils import floats_tensor, logging, slow, torch_all_close, torch_device -from diffusers.utils.testing_utils import enable_full_determinism +from diffusers.utils import logging +from diffusers.utils.testing_utils import ( + enable_full_determinism, + floats_tensor, + slow, + torch_all_close, + torch_device, +) from .test_modeling_common import ModelTesterMixin, UNetTesterMixin diff --git a/tests/models/test_models_unet_2d_condition.py b/tests/models/test_models_unet_2d_condition.py index 85d6f48a1b95..f0f91a3a86a1 100644 --- a/tests/models/test_models_unet_2d_condition.py +++ b/tests/models/test_models_unet_2d_condition.py @@ -25,17 +25,17 @@ from diffusers import UNet2DConditionModel from diffusers.models.attention_processor import CustomDiffusionAttnProcessor, LoRAAttnProcessor -from diffusers.utils import ( +from diffusers.utils import logging +from diffusers.utils.import_utils import is_xformers_available +from diffusers.utils.testing_utils import ( + enable_full_determinism, floats_tensor, load_hf_numpy, - logging, require_torch_gpu, slow, torch_all_close, torch_device, ) -from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.testing_utils import enable_full_determinism from .test_modeling_common import ModelTesterMixin, UNetTesterMixin diff --git a/tests/models/test_models_unet_3d_condition.py b/tests/models/test_models_unet_3d_condition.py index ed42c582e889..f0d6a8d72571 100644 --- a/tests/models/test_models_unet_3d_condition.py +++ b/tests/models/test_models_unet_3d_condition.py @@ -22,14 +22,9 @@ from diffusers.models import ModelMixin, UNet3DConditionModel from diffusers.models.attention_processor import AttnProcessor, LoRAAttnProcessor -from diffusers.utils import ( - floats_tensor, - logging, - skip_mps, - torch_device, -) +from diffusers.utils import logging from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.testing_utils import enable_full_determinism +from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor, skip_mps, torch_device from .test_modeling_common import ModelTesterMixin, UNetTesterMixin diff --git a/tests/models/test_models_vae.py b/tests/models/test_models_vae.py index fe38b4fc216d..fe2bcdb0af35 100644 --- a/tests/models/test_models_vae.py +++ b/tests/models/test_models_vae.py @@ -20,9 +20,16 @@ from parameterized import parameterized from diffusers import AsymmetricAutoencoderKL, AutoencoderKL, AutoencoderTiny -from diffusers.utils import floats_tensor, load_hf_numpy, require_torch_gpu, slow, torch_all_close, torch_device from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.testing_utils import enable_full_determinism +from diffusers.utils.testing_utils import ( + enable_full_determinism, + floats_tensor, + load_hf_numpy, + require_torch_gpu, + slow, + torch_all_close, + torch_device, +) from .test_modeling_common import ModelTesterMixin, UNetTesterMixin diff --git a/tests/models/test_models_vq.py b/tests/models/test_models_vq.py index 5706c13a0c45..c7b9363b5d5f 100644 --- a/tests/models/test_models_vq.py +++ b/tests/models/test_models_vq.py @@ -18,8 +18,7 @@ import torch from diffusers import 
VQModel -from diffusers.utils import floats_tensor, torch_device -from diffusers.utils.testing_utils import enable_full_determinism +from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor, torch_device from .test_modeling_common import ModelTesterMixin, UNetTesterMixin diff --git a/tests/models/test_unet_2d_blocks.py b/tests/models/test_unet_2d_blocks.py index 4d658f282932..d714b9384860 100644 --- a/tests/models/test_unet_2d_blocks.py +++ b/tests/models/test_unet_2d_blocks.py @@ -15,7 +15,7 @@ import unittest from diffusers.models.unet_2d_blocks import * # noqa F403 -from diffusers.utils import torch_device +from diffusers.utils.testing_utils import torch_device from .test_unet_blocks_common import UNetBlockTesterMixin diff --git a/tests/models/test_unet_blocks_common.py b/tests/models/test_unet_blocks_common.py index 17b7f65d6da3..4c399fdb74fa 100644 --- a/tests/models/test_unet_blocks_common.py +++ b/tests/models/test_unet_blocks_common.py @@ -17,8 +17,8 @@ import torch -from diffusers.utils import floats_tensor, randn_tensor, torch_all_close, torch_device -from diffusers.utils.testing_utils import require_torch +from diffusers.utils.testing_utils import floats_tensor, require_torch, torch_all_close, torch_device +from diffusers.utils.torch_utils import randn_tensor @require_torch diff --git a/tests/pipelines/altdiffusion/test_alt_diffusion.py b/tests/pipelines/altdiffusion/test_alt_diffusion.py index 81ec00940c12..da5eb34fe92f 100644 --- a/tests/pipelines/altdiffusion/test_alt_diffusion.py +++ b/tests/pipelines/altdiffusion/test_alt_diffusion.py @@ -25,8 +25,7 @@ RobertaSeriesConfig, RobertaSeriesModelWithTransformation, ) -from diffusers.utils import nightly, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu +from diffusers.utils.testing_utils import enable_full_determinism, nightly, require_torch_gpu, torch_device from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin diff --git a/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py b/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py index 9bef75f4fff5..57001f7bea52 100644 --- a/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py +++ b/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py @@ -32,8 +32,15 @@ RobertaSeriesConfig, RobertaSeriesModelWithTransformation, ) -from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu +from diffusers.utils import load_image +from diffusers.utils.testing_utils import ( + enable_full_determinism, + floats_tensor, + load_numpy, + nightly, + require_torch_gpu, + torch_device, +) enable_full_determinism() diff --git a/tests/pipelines/audio_diffusion/test_audio_diffusion.py b/tests/pipelines/audio_diffusion/test_audio_diffusion.py index d2b110adb00d..271e458bf565 100644 --- a/tests/pipelines/audio_diffusion/test_audio_diffusion.py +++ b/tests/pipelines/audio_diffusion/test_audio_diffusion.py @@ -29,8 +29,7 @@ UNet2DConditionModel, UNet2DModel, ) -from diffusers.utils import nightly, slow, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu +from diffusers.utils.testing_utils import enable_full_determinism, nightly, require_torch_gpu, slow, torch_device 
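The remaining test-file hunks are all the same mechanical migration: helpers such as `torch_device`, `floats_tensor`, `slow`, and `nightly` are no longer re-exported from `diffusers.utils`, so the tests import them from `diffusers.utils.testing_utils` directly. The recipe, sketched with a representative set of names:

    # Before: from diffusers.utils import floats_tensor, nightly, slow, torch_device
    # After this patch series, test helpers come from one canonical module:
    from diffusers.utils.testing_utils import (
        enable_full_determinism,
        floats_tensor,
        nightly,
        slow,
        torch_device,
    )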
enable_full_determinism() diff --git a/tests/pipelines/audioldm/test_audioldm.py b/tests/pipelines/audioldm/test_audioldm.py index 0165d3f5edda..516cea76b742 100644 --- a/tests/pipelines/audioldm/test_audioldm.py +++ b/tests/pipelines/audioldm/test_audioldm.py @@ -36,8 +36,8 @@ PNDMScheduler, UNet2DConditionModel, ) -from diffusers.utils import is_xformers_available, nightly, slow, torch_device -from diffusers.utils.testing_utils import enable_full_determinism +from diffusers.utils import is_xformers_available +from diffusers.utils.testing_utils import enable_full_determinism, nightly, slow, torch_device from ..pipeline_params import TEXT_TO_AUDIO_BATCH_PARAMS, TEXT_TO_AUDIO_PARAMS from ..test_pipelines_common import PipelineTesterMixin diff --git a/tests/pipelines/audioldm2/test_audioldm2.py b/tests/pipelines/audioldm2/test_audioldm2.py index 942aec70d7cb..b37fe4dcec48 100644 --- a/tests/pipelines/audioldm2/test_audioldm2.py +++ b/tests/pipelines/audioldm2/test_audioldm2.py @@ -44,8 +44,8 @@ LMSDiscreteScheduler, PNDMScheduler, ) -from diffusers.utils import is_accelerate_available, is_accelerate_version, is_xformers_available, slow, torch_device -from diffusers.utils.testing_utils import enable_full_determinism +from diffusers.utils import is_accelerate_available, is_accelerate_version, is_xformers_available +from diffusers.utils.testing_utils import enable_full_determinism, slow, torch_device from ..pipeline_params import TEXT_TO_AUDIO_BATCH_PARAMS, TEXT_TO_AUDIO_PARAMS from ..test_pipelines_common import PipelineTesterMixin diff --git a/tests/pipelines/consistency_models/test_consistency_models.py b/tests/pipelines/consistency_models/test_consistency_models.py index dfb19755d879..6732d5228d50 100644 --- a/tests/pipelines/consistency_models/test_consistency_models.py +++ b/tests/pipelines/consistency_models/test_consistency_models.py @@ -10,8 +10,14 @@ ConsistencyModelPipeline, UNet2DModel, ) -from diffusers.utils import nightly, randn_tensor, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_2, require_torch_gpu +from diffusers.utils.testing_utils import ( + enable_full_determinism, + nightly, + require_torch_2, + require_torch_gpu, + torch_device, +) +from diffusers.utils.torch_utils import randn_tensor from ..pipeline_params import UNCONDITIONAL_IMAGE_GENERATION_BATCH_PARAMS, UNCONDITIONAL_IMAGE_GENERATION_PARAMS from ..test_pipelines_common import PipelineTesterMixin diff --git a/tests/pipelines/controlnet/test_controlnet.py b/tests/pipelines/controlnet/test_controlnet.py index 62f011cce59e..3ede0f2c4271 100644 --- a/tests/pipelines/controlnet/test_controlnet.py +++ b/tests/pipelines/controlnet/test_controlnet.py @@ -31,14 +31,18 @@ UNet2DConditionModel, ) from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_controlnet import MultiControlNetModel -from diffusers.utils import load_image, load_numpy, randn_tensor, slow, torch_device from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import ( enable_full_determinism, + load_image, + load_numpy, require_torch_2, require_torch_gpu, run_test_in_subprocess, + slow, + torch_device, ) +from diffusers.utils.torch_utils import randn_tensor from ..pipeline_params import ( IMAGE_TO_IMAGE_IMAGE_PARAMS, diff --git a/tests/pipelines/controlnet/test_controlnet_img2img.py b/tests/pipelines/controlnet/test_controlnet_img2img.py index 4ba1b9a09ebe..209f6d23387e 100644 --- a/tests/pipelines/controlnet/test_controlnet_img2img.py +++ 
b/tests/pipelines/controlnet/test_controlnet_img2img.py @@ -33,9 +33,17 @@ UNet2DConditionModel, ) from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_controlnet import MultiControlNetModel -from diffusers.utils import floats_tensor, load_image, load_numpy, randn_tensor, slow, torch_device +from diffusers.utils import load_image from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu +from diffusers.utils.testing_utils import ( + enable_full_determinism, + floats_tensor, + load_numpy, + require_torch_gpu, + slow, + torch_device, +) +from diffusers.utils.torch_utils import randn_tensor from ..pipeline_params import ( IMAGE_TO_IMAGE_IMAGE_PARAMS, diff --git a/tests/pipelines/controlnet/test_controlnet_inpaint.py b/tests/pipelines/controlnet/test_controlnet_inpaint.py index 07519595c49e..abaa6d37b922 100644 --- a/tests/pipelines/controlnet/test_controlnet_inpaint.py +++ b/tests/pipelines/controlnet/test_controlnet_inpaint.py @@ -33,9 +33,17 @@ UNet2DConditionModel, ) from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_controlnet import MultiControlNetModel -from diffusers.utils import floats_tensor, load_image, load_numpy, randn_tensor, slow, torch_device +from diffusers.utils import load_image from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu +from diffusers.utils.testing_utils import ( + enable_full_determinism, + floats_tensor, + load_numpy, + require_torch_gpu, + slow, + torch_device, +) +from diffusers.utils.torch_utils import randn_tensor from ..pipeline_params import ( TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, diff --git a/tests/pipelines/controlnet/test_controlnet_inpaint_sdxl.py b/tests/pipelines/controlnet/test_controlnet_inpaint_sdxl.py index 8dbfb95d0960..81c789e71260 100644 --- a/tests/pipelines/controlnet/test_controlnet_inpaint_sdxl.py +++ b/tests/pipelines/controlnet/test_controlnet_inpaint_sdxl.py @@ -28,9 +28,8 @@ StableDiffusionXLControlNetInpaintPipeline, UNet2DConditionModel, ) -from diffusers.utils import floats_tensor, torch_device from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu +from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor, require_torch_gpu, torch_device from ..pipeline_params import ( IMAGE_TO_IMAGE_IMAGE_PARAMS, diff --git a/tests/pipelines/controlnet/test_controlnet_sdxl.py b/tests/pipelines/controlnet/test_controlnet_sdxl.py index 8fb76499dc14..264b879e44be 100644 --- a/tests/pipelines/controlnet/test_controlnet_sdxl.py +++ b/tests/pipelines/controlnet/test_controlnet_sdxl.py @@ -28,9 +28,9 @@ UNet2DConditionModel, ) from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_controlnet import MultiControlNetModel -from diffusers.utils import load_image, randn_tensor, torch_device from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, slow +from diffusers.utils.testing_utils import enable_full_determinism, load_image, require_torch_gpu, slow, torch_device +from diffusers.utils.torch_utils import randn_tensor from ..pipeline_params import ( IMAGE_TO_IMAGE_IMAGE_PARAMS, diff --git a/tests/pipelines/controlnet/test_controlnet_sdxl_img2img.py b/tests/pipelines/controlnet/test_controlnet_sdxl_img2img.py index 
1028e4cb2b61..ee8c479b1894 100644 --- a/tests/pipelines/controlnet/test_controlnet_sdxl_img2img.py +++ b/tests/pipelines/controlnet/test_controlnet_sdxl_img2img.py @@ -27,9 +27,8 @@ StableDiffusionXLControlNetImg2ImgPipeline, UNet2DConditionModel, ) -from diffusers.utils import floats_tensor, torch_device from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu +from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor, require_torch_gpu, torch_device from ..pipeline_params import ( IMAGE_TO_IMAGE_IMAGE_PARAMS, diff --git a/tests/pipelines/controlnet/test_flax_controlnet.py b/tests/pipelines/controlnet/test_flax_controlnet.py index 4ad75b407acc..e4d131195d6a 100644 --- a/tests/pipelines/controlnet/test_flax_controlnet.py +++ b/tests/pipelines/controlnet/test_flax_controlnet.py @@ -17,8 +17,8 @@ import unittest from diffusers import FlaxControlNetModel, FlaxStableDiffusionControlNetPipeline -from diffusers.utils import is_flax_available, load_image, slow -from diffusers.utils.testing_utils import require_flax +from diffusers.utils import is_flax_available, load_image +from diffusers.utils.testing_utils import require_flax, slow if is_flax_available(): diff --git a/tests/pipelines/dance_diffusion/test_dance_diffusion.py b/tests/pipelines/dance_diffusion/test_dance_diffusion.py index b517b02bbabf..fa10f29ee1f6 100644 --- a/tests/pipelines/dance_diffusion/test_dance_diffusion.py +++ b/tests/pipelines/dance_diffusion/test_dance_diffusion.py @@ -20,8 +20,7 @@ import torch from diffusers import DanceDiffusionPipeline, IPNDMScheduler, UNet1DModel -from diffusers.utils import nightly, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, skip_mps +from diffusers.utils.testing_utils import enable_full_determinism, nightly, require_torch_gpu, skip_mps, torch_device from ..pipeline_params import UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS, UNCONDITIONAL_AUDIO_GENERATION_PARAMS from ..test_pipelines_common import PipelineTesterMixin diff --git a/tests/pipelines/deepfloyd_if/test_if_img2img.py b/tests/pipelines/deepfloyd_if/test_if_img2img.py index ec4598906a6f..bfb70c5c9b98 100644 --- a/tests/pipelines/deepfloyd_if/test_if_img2img.py +++ b/tests/pipelines/deepfloyd_if/test_if_img2img.py @@ -19,9 +19,8 @@ import torch from diffusers import IFImg2ImgPipeline -from diffusers.utils import floats_tensor from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.testing_utils import skip_mps, torch_device +from diffusers.utils.testing_utils import floats_tensor, skip_mps, torch_device from ..pipeline_params import ( TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, diff --git a/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py b/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py index 500557108aed..f35f3e945609 100644 --- a/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py +++ b/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py @@ -19,9 +19,8 @@ import torch from diffusers import IFImg2ImgSuperResolutionPipeline -from diffusers.utils import floats_tensor from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.testing_utils import skip_mps, torch_device +from diffusers.utils.testing_utils import floats_tensor, skip_mps, torch_device from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS from 
..test_pipelines_common import PipelineTesterMixin diff --git a/tests/pipelines/deepfloyd_if/test_if_inpainting.py b/tests/pipelines/deepfloyd_if/test_if_inpainting.py index 1317fcb64e81..68753c0ac1cd 100644 --- a/tests/pipelines/deepfloyd_if/test_if_inpainting.py +++ b/tests/pipelines/deepfloyd_if/test_if_inpainting.py @@ -19,9 +19,8 @@ import torch from diffusers import IFInpaintingPipeline -from diffusers.utils import floats_tensor from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.testing_utils import skip_mps, torch_device +from diffusers.utils.testing_utils import floats_tensor, skip_mps, torch_device from ..pipeline_params import ( TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, diff --git a/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py b/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py index 961a22675f33..03b92e0d783c 100644 --- a/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py +++ b/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py @@ -19,9 +19,8 @@ import torch from diffusers import IFInpaintingSuperResolutionPipeline -from diffusers.utils import floats_tensor from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.testing_utils import skip_mps, torch_device +from diffusers.utils.testing_utils import floats_tensor, skip_mps, torch_device from ..pipeline_params import ( TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, diff --git a/tests/pipelines/deepfloyd_if/test_if_superresolution.py b/tests/pipelines/deepfloyd_if/test_if_superresolution.py index 52fb38308892..5a74148e6661 100644 --- a/tests/pipelines/deepfloyd_if/test_if_superresolution.py +++ b/tests/pipelines/deepfloyd_if/test_if_superresolution.py @@ -19,9 +19,8 @@ import torch from diffusers import IFSuperResolutionPipeline -from diffusers.utils import floats_tensor from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.testing_utils import skip_mps, torch_device +from diffusers.utils.testing_utils import floats_tensor, skip_mps, torch_device from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS from ..test_pipelines_common import PipelineTesterMixin diff --git a/tests/pipelines/dit/test_dit.py b/tests/pipelines/dit/test_dit.py index 2f91473b070b..8f4d11ec3838 100644 --- a/tests/pipelines/dit/test_dit.py +++ b/tests/pipelines/dit/test_dit.py @@ -20,8 +20,8 @@ import torch from diffusers import AutoencoderKL, DDIMScheduler, DiTPipeline, DPMSolverMultistepScheduler, Transformer2DModel -from diffusers.utils import is_xformers_available, load_numpy, nightly, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu +from diffusers.utils import is_xformers_available +from diffusers.utils.testing_utils import enable_full_determinism, load_numpy, nightly, require_torch_gpu, torch_device from ..pipeline_params import ( CLASS_CONDITIONED_IMAGE_GENERATION_BATCH_PARAMS, diff --git a/tests/pipelines/kandinsky/test_kandinsky.py b/tests/pipelines/kandinsky/test_kandinsky.py index 01b8a0f3eec1..dd0cc75d629a 100644 --- a/tests/pipelines/kandinsky/test_kandinsky.py +++ b/tests/pipelines/kandinsky/test_kandinsky.py @@ -23,8 +23,14 @@ from diffusers import DDIMScheduler, KandinskyPipeline, KandinskyPriorPipeline, UNet2DConditionModel, VQModel from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP -from diffusers.utils import floats_tensor, load_numpy, slow, 
torch_device -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu +from diffusers.utils.testing_utils import ( + enable_full_determinism, + floats_tensor, + load_numpy, + require_torch_gpu, + slow, + torch_device, +) from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference diff --git a/tests/pipelines/kandinsky/test_kandinsky_combined.py b/tests/pipelines/kandinsky/test_kandinsky_combined.py index 7629407ab745..d2079d67b60e 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_combined.py +++ b/tests/pipelines/kandinsky/test_kandinsky_combined.py @@ -18,8 +18,7 @@ import numpy as np from diffusers import KandinskyCombinedPipeline, KandinskyImg2ImgCombinedPipeline, KandinskyInpaintCombinedPipeline -from diffusers.utils import torch_device -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, torch_device from ..test_pipelines_common import PipelineTesterMixin from .test_kandinsky import Dummies diff --git a/tests/pipelines/kandinsky/test_kandinsky_img2img.py b/tests/pipelines/kandinsky/test_kandinsky_img2img.py index f309dec89370..d91f779d2221 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_img2img.py +++ b/tests/pipelines/kandinsky/test_kandinsky_img2img.py @@ -31,8 +31,16 @@ VQModel, ) from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP -from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu +from diffusers.utils.testing_utils import ( + enable_full_determinism, + floats_tensor, + load_image, + load_numpy, + nightly, + require_torch_gpu, + slow, + torch_device, +) from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference diff --git a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py index 7f1841d60807..73c4eadadd96 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py +++ b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py @@ -24,8 +24,15 @@ from diffusers import DDIMScheduler, KandinskyInpaintPipeline, KandinskyPriorPipeline, UNet2DConditionModel, VQModel from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP -from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu +from diffusers.utils.testing_utils import ( + enable_full_determinism, + floats_tensor, + load_image, + load_numpy, + require_torch_gpu, + slow, + torch_device, +) from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference diff --git a/tests/pipelines/kandinsky/test_kandinsky_prior.py b/tests/pipelines/kandinsky/test_kandinsky_prior.py index 7b1acc9fc03e..b9f78ee0e8af 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_prior.py +++ b/tests/pipelines/kandinsky/test_kandinsky_prior.py @@ -28,8 +28,7 @@ ) from diffusers import KandinskyPriorPipeline, PriorTransformer, UnCLIPScheduler -from diffusers.utils import torch_device -from diffusers.utils.testing_utils import enable_full_determinism, skip_mps +from diffusers.utils.testing_utils import enable_full_determinism, skip_mps, torch_device from ..test_pipelines_common import PipelineTesterMixin diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky.py 
b/tests/pipelines/kandinsky_v22/test_kandinsky.py index 6430a476ab98..4f18990c2c0a 100644 --- a/tests/pipelines/kandinsky_v22/test_kandinsky.py +++ b/tests/pipelines/kandinsky_v22/test_kandinsky.py @@ -21,8 +21,14 @@ import torch from diffusers import DDIMScheduler, KandinskyV22Pipeline, KandinskyV22PriorPipeline, UNet2DConditionModel, VQModel -from diffusers.utils import floats_tensor, load_numpy, slow, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu +from diffusers.utils.testing_utils import ( + enable_full_determinism, + floats_tensor, + load_numpy, + require_torch_gpu, + slow, + torch_device, +) from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_combined.py b/tests/pipelines/kandinsky_v22/test_kandinsky_combined.py index 7591b2347a92..ba8888ee1fa6 100644 --- a/tests/pipelines/kandinsky_v22/test_kandinsky_combined.py +++ b/tests/pipelines/kandinsky_v22/test_kandinsky_combined.py @@ -22,8 +22,7 @@ KandinskyV22Img2ImgCombinedPipeline, KandinskyV22InpaintCombinedPipeline, ) -from diffusers.utils import torch_device -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, torch_device from ..test_pipelines_common import PipelineTesterMixin from .test_kandinsky import Dummies diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet.py b/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet.py index a50bdb50a47b..575d0aaaa767 100644 --- a/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet.py +++ b/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet.py @@ -27,8 +27,15 @@ UNet2DConditionModel, VQModel, ) -from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu +from diffusers.utils.testing_utils import ( + enable_full_determinism, + floats_tensor, + load_image, + load_numpy, + require_torch_gpu, + slow, + torch_device, +) from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet_img2img.py b/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet_img2img.py index 9d0ac96888ec..17394316ce7a 100644 --- a/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet_img2img.py +++ b/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet_img2img.py @@ -28,8 +28,15 @@ UNet2DConditionModel, VQModel, ) -from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu +from diffusers.utils.testing_utils import ( + enable_full_determinism, + floats_tensor, + load_image, + load_numpy, + require_torch_gpu, + slow, + torch_device, +) from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_img2img.py b/tests/pipelines/kandinsky_v22/test_kandinsky_img2img.py index 17f27d0d7804..1454b061bc90 100644 --- a/tests/pipelines/kandinsky_v22/test_kandinsky_img2img.py +++ b/tests/pipelines/kandinsky_v22/test_kandinsky_img2img.py @@ -28,8 +28,15 @@ UNet2DConditionModel, VQModel, ) -from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, 
require_torch_gpu +from diffusers.utils.testing_utils import ( + enable_full_determinism, + floats_tensor, + load_image, + load_numpy, + require_torch_gpu, + slow, + torch_device, +) from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky_v22/test_kandinsky_inpaint.py index 436c240e1ac8..d7fcf670278d 100644 --- a/tests/pipelines/kandinsky_v22/test_kandinsky_inpaint.py +++ b/tests/pipelines/kandinsky_v22/test_kandinsky_inpaint.py @@ -28,8 +28,15 @@ UNet2DConditionModel, VQModel, ) -from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu +from diffusers.utils.testing_utils import ( + enable_full_determinism, + floats_tensor, + load_image, + load_numpy, + require_torch_gpu, + slow, + torch_device, +) from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_prior.py b/tests/pipelines/kandinsky_v22/test_kandinsky_prior.py index 3191f6a11309..317e822a465a 100644 --- a/tests/pipelines/kandinsky_v22/test_kandinsky_prior.py +++ b/tests/pipelines/kandinsky_v22/test_kandinsky_prior.py @@ -28,8 +28,7 @@ ) from diffusers import KandinskyV22PriorPipeline, PriorTransformer, UnCLIPScheduler -from diffusers.utils import torch_device -from diffusers.utils.testing_utils import enable_full_determinism, skip_mps +from diffusers.utils.testing_utils import enable_full_determinism, skip_mps, torch_device from ..test_pipelines_common import PipelineTesterMixin diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_prior_emb2emb.py b/tests/pipelines/kandinsky_v22/test_kandinsky_prior_emb2emb.py index 75d101e9c10d..f71cbfcd0b5c 100644 --- a/tests/pipelines/kandinsky_v22/test_kandinsky_prior_emb2emb.py +++ b/tests/pipelines/kandinsky_v22/test_kandinsky_prior_emb2emb.py @@ -30,8 +30,7 @@ ) from diffusers import KandinskyV22PriorEmb2EmbPipeline, PriorTransformer, UnCLIPScheduler -from diffusers.utils import floats_tensor, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, skip_mps +from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor, skip_mps, torch_device from ..test_pipelines_common import PipelineTesterMixin diff --git a/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py b/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py index d21ead543af8..c26a8b407b67 100644 --- a/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py +++ b/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py @@ -20,8 +20,15 @@ import torch from diffusers import DDIMScheduler, LDMSuperResolutionPipeline, UNet2DModel, VQModel -from diffusers.utils import PIL_INTERPOLATION, floats_tensor, load_image, slow, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, require_torch +from diffusers.utils import PIL_INTERPOLATION +from diffusers.utils.testing_utils import ( + enable_full_determinism, + floats_tensor, + load_image, + require_torch, + slow, + torch_device, +) enable_full_determinism() diff --git a/tests/pipelines/musicldm/test_musicldm.py b/tests/pipelines/musicldm/test_musicldm.py index 4874bf16942d..ea4c52aee1eb 100644 --- a/tests/pipelines/musicldm/test_musicldm.py +++ b/tests/pipelines/musicldm/test_musicldm.py @@ -38,8 +38,8 @@ PNDMScheduler, 
UNet2DConditionModel, ) -from diffusers.utils import is_xformers_available, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, nightly, require_torch_gpu +from diffusers.utils import is_xformers_available +from diffusers.utils.testing_utils import enable_full_determinism, nightly, require_torch_gpu, torch_device from ..pipeline_params import TEXT_TO_AUDIO_BATCH_PARAMS, TEXT_TO_AUDIO_PARAMS from ..test_pipelines_common import PipelineTesterMixin diff --git a/tests/pipelines/paint_by_example/test_paint_by_example.py b/tests/pipelines/paint_by_example/test_paint_by_example.py index 8b5b50b9f819..3148f9483124 100644 --- a/tests/pipelines/paint_by_example/test_paint_by_example.py +++ b/tests/pipelines/paint_by_example/test_paint_by_example.py @@ -24,8 +24,14 @@ from diffusers import AutoencoderKL, PaintByExamplePipeline, PNDMScheduler, UNet2DConditionModel from diffusers.pipelines.paint_by_example import PaintByExampleImageEncoder -from diffusers.utils import floats_tensor, load_image, nightly, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu +from diffusers.utils.testing_utils import ( + enable_full_determinism, + floats_tensor, + load_image, + nightly, + require_torch_gpu, + torch_device, +) from ..pipeline_params import IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS from ..test_pipelines_common import PipelineTesterMixin diff --git a/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py b/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py index 9e810616dc56..a09d0df79094 100644 --- a/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py +++ b/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py @@ -24,8 +24,13 @@ from diffusers import AutoencoderKL, DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel from diffusers.pipelines.semantic_stable_diffusion import SemanticStableDiffusionPipeline as StableDiffusionPipeline -from diffusers.utils import floats_tensor, nightly, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu +from diffusers.utils.testing_utils import ( + enable_full_determinism, + floats_tensor, + nightly, + require_torch_gpu, + torch_device, +) enable_full_determinism() diff --git a/tests/pipelines/shap_e/test_shap_e.py b/tests/pipelines/shap_e/test_shap_e.py index 90ff37de6e9a..f3c782c14bb2 100644 --- a/tests/pipelines/shap_e/test_shap_e.py +++ b/tests/pipelines/shap_e/test_shap_e.py @@ -21,8 +21,7 @@ from diffusers import HeunDiscreteScheduler, PriorTransformer, ShapEPipeline from diffusers.pipelines.shap_e import ShapERenderer -from diffusers.utils import load_numpy, slow -from diffusers.utils.testing_utils import require_torch_gpu, torch_device +from diffusers.utils.testing_utils import load_numpy, require_torch_gpu, slow, torch_device from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference diff --git a/tests/pipelines/shap_e/test_shap_e_img2img.py b/tests/pipelines/shap_e/test_shap_e_img2img.py index 0dffac98aa25..44597e2fe49a 100644 --- a/tests/pipelines/shap_e/test_shap_e_img2img.py +++ b/tests/pipelines/shap_e/test_shap_e_img2img.py @@ -22,8 +22,7 @@ from diffusers import HeunDiscreteScheduler, PriorTransformer, ShapEImg2ImgPipeline from diffusers.pipelines.shap_e import ShapERenderer -from diffusers.utils import floats_tensor, load_image, load_numpy, slow -from diffusers.utils.testing_utils import 
require_torch_gpu, torch_device +from diffusers.utils.testing_utils import floats_tensor, load_image, load_numpy, require_torch_gpu, slow, torch_device from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference diff --git a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py index e70b377e2fe0..1d00c7e963bb 100644 --- a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py +++ b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py @@ -21,8 +21,15 @@ from diffusers import DDPMScheduler, MidiProcessor, SpectrogramDiffusionPipeline from diffusers.pipelines.spectrogram_diffusion import SpectrogramContEncoder, SpectrogramNotesEncoder, T5FilmDecoder -from diffusers.utils import nightly, require_torch_gpu, skip_mps, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, require_note_seq, require_onnxruntime +from diffusers.utils.testing_utils import ( + enable_full_determinism, + nightly, + require_note_seq, + require_onnxruntime, + require_torch_gpu, + skip_mps, + torch_device, +) from ..pipeline_params import TOKENS_TO_AUDIO_GENERATION_BATCH_PARAMS, TOKENS_TO_AUDIO_GENERATION_PARAMS from ..test_pipelines_common import PipelineTesterMixin diff --git a/tests/pipelines/stable_diffusion/test_cycle_diffusion.py b/tests/pipelines/stable_diffusion/test_cycle_diffusion.py index 9a54c21c0a21..27a5da556021 100644 --- a/tests/pipelines/stable_diffusion/test_cycle_diffusion.py +++ b/tests/pipelines/stable_diffusion/test_cycle_diffusion.py @@ -22,8 +22,16 @@ from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer from diffusers import AutoencoderKL, CycleDiffusionPipeline, DDIMScheduler, UNet2DConditionModel -from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, skip_mps +from diffusers.utils.testing_utils import ( + enable_full_determinism, + floats_tensor, + load_image, + load_numpy, + require_torch_gpu, + skip_mps, + slow, + torch_device, +) from ..pipeline_params import ( IMAGE_TO_IMAGE_IMAGE_PARAMS, diff --git a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py index 9147dc461fc5..d7d549b7b5c2 100644 --- a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py +++ b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py @@ -26,8 +26,8 @@ OnnxStableDiffusionImg2ImgPipeline, PNDMScheduler, ) -from diffusers.utils import floats_tensor from diffusers.utils.testing_utils import ( + floats_tensor, is_onnx_available, load_image, nightly, diff --git a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_upscale.py b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_upscale.py index c65030406465..56c10adbd6ae 100644 --- a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_upscale.py +++ b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_upscale.py @@ -26,8 +26,8 @@ OnnxStableDiffusionUpscalePipeline, PNDMScheduler, ) -from diffusers.utils import floats_tensor from diffusers.utils.testing_utils import ( + floats_tensor, is_onnx_available, load_image, nightly, diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py index 31de557a0ac3..e67bfd661cc1 100644 --- 
a/tests/pipelines/stable_diffusion/test_stable_diffusion.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py @@ -38,14 +38,17 @@ logging, ) from diffusers.models.attention_processor import AttnProcessor, LoRAXFormersAttnProcessor -from diffusers.utils import load_numpy, nightly, slow, torch_device from diffusers.utils.testing_utils import ( CaptureLogger, enable_full_determinism, + load_numpy, + nightly, numpy_cosine_similarity_distance, require_torch_2, require_torch_gpu, run_test_in_subprocess, + slow, + torch_device, ) from ...models.test_lora_layers import create_unet_lora_layers diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_adapter.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_adapter.py index 5778e862a3b0..c0ef4ceae92c 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_adapter.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_adapter.py @@ -30,9 +30,17 @@ T2IAdapter, UNet2DConditionModel, ) -from diffusers.utils import floats_tensor, load_image, load_numpy, logging, slow, torch_device +from diffusers.utils import logging from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu +from diffusers.utils.testing_utils import ( + enable_full_determinism, + floats_tensor, + load_image, + load_numpy, + require_torch_gpu, + slow, + torch_device, +) from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_gligen_text_image.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_gligen_text_image.py index e2b4f59dd103..4e14adc81f42 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_gligen_text_image.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_gligen_text_image.py @@ -28,11 +28,11 @@ from diffusers import ( AutoencoderKL, - CLIPImageProjection, DDIMScheduler, StableDiffusionGLIGENTextImagePipeline, UNet2DConditionModel, ) +from diffusers.pipelines.stable_diffusion import CLIPImageProjection from diffusers.utils import load_image from diffusers.utils.testing_utils import enable_full_determinism diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py index 580c78675a92..b6d6c7b80c98 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py @@ -29,8 +29,16 @@ StableDiffusionImageVariationPipeline, UNet2DConditionModel, ) -from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu +from diffusers.utils.testing_utils import ( + enable_full_determinism, + floats_tensor, + load_image, + load_numpy, + nightly, + require_torch_gpu, + slow, + torch_device, +) from ..pipeline_params import IMAGE_VARIATION_BATCH_PARAMS, IMAGE_VARIATION_PARAMS from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py index 043825c2f75d..cf22fccd8232 100644 --- 
a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py @@ -32,13 +32,18 @@ StableDiffusionImg2ImgPipeline, UNet2DConditionModel, ) -from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device from diffusers.utils.testing_utils import ( enable_full_determinism, + floats_tensor, + load_image, + load_numpy, + nightly, require_torch_2, require_torch_gpu, run_test_in_subprocess, skip_mps, + slow, + torch_device, ) from ..pipeline_params import ( diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py index 4d75992b74a8..21e8c05ac28f 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py @@ -36,12 +36,17 @@ ) from diffusers.models.attention_processor import AttnProcessor from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint import prepare_mask_and_masked_image -from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device from diffusers.utils.testing_utils import ( enable_full_determinism, + floats_tensor, + load_image, + load_numpy, + nightly, require_torch_2, require_torch_gpu, run_test_in_subprocess, + slow, + torch_device, ) from ...models.test_models_unet_2d_condition import create_lora_layers diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py index fa00a0d201af..45563cdb798b 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py @@ -33,8 +33,17 @@ UNet2DModel, VQModel, ) -from diffusers.utils import floats_tensor, load_image, nightly, slow, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, load_numpy, preprocess_image, require_torch_gpu +from diffusers.utils.testing_utils import ( + enable_full_determinism, + floats_tensor, + load_image, + load_numpy, + nightly, + preprocess_image, + require_torch_gpu, + slow, + torch_device, +) enable_full_determinism() diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py index 513e11c105d5..07fd8e1b5192 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py @@ -32,8 +32,14 @@ UNet2DConditionModel, ) from diffusers.image_processor import VaeImageProcessor -from diffusers.utils import floats_tensor, load_image, slow, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu +from diffusers.utils.testing_utils import ( + enable_full_determinism, + floats_tensor, + load_image, + require_torch_gpu, + slow, + torch_device, +) from ..pipeline_params import ( IMAGE_TO_IMAGE_IMAGE_PARAMS, diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_k_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_k_diffusion.py index 25da13d9f922..672c0ebfa0d8 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_k_diffusion.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_k_diffusion.py @@ -20,8 +20,7 @@ import torch from diffusers import 
StableDiffusionKDiffusionPipeline -from diffusers.utils import slow, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, slow, torch_device enable_full_determinism() diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_ldm3d.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_ldm3d.py index e2164e8117ad..b812f1d3c257 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_ldm3d.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_ldm3d.py @@ -28,8 +28,7 @@ StableDiffusionLDM3DPipeline, UNet2DConditionModel, ) -from diffusers.utils import nightly, slow, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu +from diffusers.utils.testing_utils import enable_full_determinism, nightly, require_torch_gpu, slow, torch_device from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py index 81d1baed5df6..b7ddd2fd59f8 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py @@ -28,8 +28,7 @@ StableDiffusionModelEditingPipeline, UNet2DConditionModel, ) -from diffusers.utils import slow, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, skip_mps +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, skip_mps, slow, torch_device from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py index a10e74742c4d..657608df8b98 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py @@ -29,8 +29,7 @@ StableDiffusionPanoramaPipeline, UNet2DConditionModel, ) -from diffusers.utils import nightly, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, skip_mps +from diffusers.utils.testing_utils import enable_full_determinism, nightly, require_torch_gpu, skip_mps, torch_device from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_paradigms.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_paradigms.py index 781cbcbd69a1..3ce476d09be9 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_paradigms.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_paradigms.py @@ -27,10 +27,11 @@ StableDiffusionParadigmsPipeline, UNet2DConditionModel, ) -from diffusers.utils import slow, torch_device from diffusers.utils.testing_utils import ( enable_full_determinism, require_torch_gpu, + slow, + torch_device, ) from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS diff --git 
a/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py index c513fb1c0b33..54b82f2f2487 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py @@ -33,8 +33,17 @@ UNet2DConditionModel, ) from diffusers.image_processor import VaeImageProcessor -from diffusers.utils import floats_tensor, load_numpy, nightly, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, load_image, load_pt, require_torch_gpu, skip_mps +from diffusers.utils.testing_utils import ( + enable_full_determinism, + floats_tensor, + load_image, + load_numpy, + load_pt, + nightly, + require_torch_gpu, + skip_mps, + torch_device, +) from ..pipeline_params import ( TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py index 79d76666c392..b87d11e85876 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py @@ -26,8 +26,7 @@ StableDiffusionSAGPipeline, UNet2DConditionModel, ) -from diffusers.utils import slow, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, slow, torch_device from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py index 3991366966c3..3842dda2e551 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py @@ -32,12 +32,15 @@ UNet2DConditionModel, logging, ) -from diffusers.utils import load_numpy, nightly, slow, torch_device from diffusers.utils.testing_utils import ( CaptureLogger, enable_full_determinism, + load_numpy, + nightly, numpy_cosine_similarity_distance, require_torch_gpu, + slow, + torch_device, ) from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py index 3e280058b1f7..fcd6ff8d77f3 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py @@ -26,8 +26,13 @@ StableDiffusionAttendAndExcitePipeline, UNet2DConditionModel, ) -from diffusers.utils import load_numpy, skip_mps, slow -from diffusers.utils.testing_utils import numpy_cosine_similarity_distance, require_torch_gpu +from diffusers.utils.testing_utils import ( + load_numpy, + numpy_cosine_similarity_distance, + require_torch_gpu, + skip_mps, + slow, +) from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py 
b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py index 236bec5bac38..149c90698f1c 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py @@ -39,17 +39,18 @@ StableDiffusionDepth2ImgPipeline, UNet2DConditionModel, ) -from diffusers.utils import ( +from diffusers.utils import is_accelerate_available, is_accelerate_version +from diffusers.utils.testing_utils import ( + enable_full_determinism, floats_tensor, - is_accelerate_available, - is_accelerate_version, load_image, load_numpy, nightly, + require_torch_gpu, + skip_mps, slow, torch_device, ) -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, skip_mps from ..pipeline_params import ( IMAGE_TO_IMAGE_IMAGE_PARAMS, diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py index 8be8f276df25..c4cfaee9cf31 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py @@ -32,8 +32,15 @@ StableDiffusionDiffEditPipeline, UNet2DConditionModel, ) -from diffusers.utils import load_image, nightly, slow -from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor, require_torch_gpu, torch_device +from diffusers.utils.testing_utils import ( + enable_full_determinism, + floats_tensor, + load_image, + nightly, + require_torch_gpu, + slow, + torch_device, +) from ..pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_flax.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_flax.py index fa93da9052f7..358d137b5781 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_flax.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_flax.py @@ -17,8 +17,8 @@ import unittest from diffusers import FlaxDPMSolverMultistepScheduler, FlaxStableDiffusionPipeline -from diffusers.utils import is_flax_available, nightly, slow -from diffusers.utils.testing_utils import require_flax +from diffusers.utils import is_flax_available +from diffusers.utils.testing_utils import nightly, require_flax, slow if is_flax_available(): diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_flax_inpaint.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_flax_inpaint.py index 432619a79ddd..3d9e6c0dc5e1 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_flax_inpaint.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_flax_inpaint.py @@ -17,8 +17,8 @@ import unittest from diffusers import FlaxStableDiffusionInpaintPipeline -from diffusers.utils import is_flax_available, load_image, slow -from diffusers.utils.testing_utils import require_flax +from diffusers.utils import is_flax_available, load_image +from diffusers.utils.testing_utils import require_flax, slow if is_flax_available(): diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py index 68a4b5132375..1e726b95960f 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py @@ -23,8 +23,15 @@ from 
transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer from diffusers import AutoencoderKL, PNDMScheduler, StableDiffusionInpaintPipeline, UNet2DConditionModel -from diffusers.utils import floats_tensor, load_image, load_numpy, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, slow +from diffusers.utils.testing_utils import ( + enable_full_determinism, + floats_tensor, + load_image, + load_numpy, + require_torch_gpu, + slow, + torch_device, +) from ..pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py index ce55bddc4fe0..e20438a2af6b 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py @@ -30,8 +30,15 @@ UNet2DConditionModel, ) from diffusers.schedulers import KarrasDiffusionSchedulers -from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu +from diffusers.utils.testing_utils import ( + enable_full_determinism, + floats_tensor, + load_image, + load_numpy, + require_torch_gpu, + slow, + torch_device, +) from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py index ab7eb2e0fd99..2c0f37519ad8 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py @@ -24,8 +24,15 @@ from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer from diffusers import AutoencoderKL, DDIMScheduler, DDPMScheduler, StableDiffusionUpscalePipeline, UNet2DConditionModel -from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu +from diffusers.utils.testing_utils import ( + enable_full_determinism, + floats_tensor, + load_image, + load_numpy, + require_torch_gpu, + slow, + torch_device, +) enable_full_determinism() diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py index a8c857d75532..6062f5edb80b 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py @@ -31,8 +31,14 @@ UNet2DConditionModel, ) from diffusers.models.attention_processor import AttnProcessor -from diffusers.utils import load_numpy, slow, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, numpy_cosine_similarity_distance, require_torch_gpu +from diffusers.utils.testing_utils import ( + enable_full_determinism, + load_numpy, + numpy_cosine_similarity_distance, + require_torch_gpu, + slow, + torch_device, +) enable_full_determinism() diff --git 
a/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py b/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py index 09e31aacfbc9..ce57ccadd4f8 100644 --- a/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py +++ b/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py @@ -24,8 +24,7 @@ from diffusers import AutoencoderKL, DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel from diffusers.pipelines.stable_diffusion_safe import StableDiffusionPipelineSafe as StableDiffusionPipeline -from diffusers.utils import floats_tensor, nightly, torch_device -from diffusers.utils.testing_utils import require_torch_gpu +from diffusers.utils.testing_utils import floats_tensor, nightly, require_torch_gpu, torch_device class SafeDiffusionPipelineFastTests(unittest.TestCase): diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py index 909f759ff745..dad52238f73a 100644 --- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py +++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py @@ -32,8 +32,7 @@ UNet2DConditionModel, UniPCMultistepScheduler, ) -from diffusers.utils import torch_device -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, torch_device from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py index afe7da3319c7..e71f103005a3 100644 --- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py +++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py @@ -27,8 +27,7 @@ T2IAdapter, UNet2DConditionModel, ) -from diffusers.utils import floats_tensor -from diffusers.utils.testing_utils import enable_full_determinism +from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS from ..test_pipelines_common import PipelineTesterMixin diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py index 04cbb09f5196..b372971dedba 100644 --- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py +++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py @@ -26,8 +26,7 @@ StableDiffusionXLImg2ImgPipeline, UNet2DConditionModel, ) -from diffusers.utils import floats_tensor, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu +from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor, require_torch_gpu, torch_device from ..pipeline_params import ( IMAGE_TO_IMAGE_IMAGE_PARAMS, diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py index dd8f8c18b09c..5d0def014ff5 100644 --- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py +++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py @@ -32,8 +32,7 @@ UNet2DConditionModel, 
UniPCMultistepScheduler, ) -from diffusers.utils import floats_tensor, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu +from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor, require_torch_gpu, torch_device from ..pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_instruction_pix2pix.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_instruction_pix2pix.py index 2608886ded98..ca4017d11b79 100644 --- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_instruction_pix2pix.py +++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_instruction_pix2pix.py @@ -29,8 +29,7 @@ from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_instruct_pix2pix import ( StableDiffusionXLInstructPix2PixPipeline, ) -from diffusers.utils import floats_tensor, torch_device -from diffusers.utils.testing_utils import enable_full_determinism +from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor, torch_device from ..pipeline_params import ( IMAGE_TO_IMAGE_IMAGE_PARAMS, diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py index 02fead022e89..927c5ec28518 100644 --- a/tests/pipelines/test_pipelines.py +++ b/tests/pipelines/test_pipelines.py @@ -62,24 +62,24 @@ from diffusers.utils import ( CONFIG_NAME, WEIGHTS_NAME, - floats_tensor, - is_compiled_module, - nightly, - require_torch_2, - slow, - torch_device, ) from diffusers.utils.testing_utils import ( CaptureLogger, enable_full_determinism, + floats_tensor, get_tests_dir, load_numpy, + nightly, require_compel, require_flax, require_onnxruntime, + require_torch_2, require_torch_gpu, run_test_in_subprocess, + slow, + torch_device, ) +from diffusers.utils.torch_utils import is_compiled_module enable_full_determinism() diff --git a/tests/pipelines/test_pipelines_auto.py b/tests/pipelines/test_pipelines_auto.py index e48a99c01e7d..bfdedd25babe 100644 --- a/tests/pipelines/test_pipelines_auto.py +++ b/tests/pipelines/test_pipelines_auto.py @@ -34,7 +34,7 @@ AUTO_INPAINT_PIPELINES_MAPPING, AUTO_TEXT2IMAGE_PIPELINES_MAPPING, ) -from diffusers.utils import slow +from diffusers.utils.testing_utils import slow PRETRAINED_MODEL_REPO_MAPPING = OrderedDict( diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index a6f828443cb0..c70ccc635780 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -455,12 +455,13 @@ def _test_inference_batch_single_identical( # TODO same as above test_mean_pixel_difference = torch_device != "mps" + generator_device = "cpu" components = self.get_dummy_components() pipe = self.pipeline_class(**components) pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs(torch_device) + inputs = self.get_dummy_inputs(generator_device) logger = logging.get_logger(pipe.__module__) logger.setLevel(level=diffusers.logging.FATAL) @@ -624,7 +625,8 @@ def test_save_load_optional_components(self, expected_max_difference=1e-4): for optional_component in pipe._optional_components: setattr(pipe, optional_component, None) - inputs = self.get_dummy_inputs(torch_device) + generator_device = "cpu" + inputs = self.get_dummy_inputs(generator_device) output = 
pipe(**inputs)[0] with tempfile.TemporaryDirectory() as tmpdir: @@ -642,7 +644,7 @@ def test_save_load_optional_components(self, expected_max_difference=1e-4): f"`{optional_component}` did not stay set to None after loading.", ) - inputs = self.get_dummy_inputs(torch_device) + inputs = self.get_dummy_inputs(generator_device) output_loaded = pipe_loaded(**inputs)[0] max_diff = np.abs(to_np(output) - to_np(output_loaded)).max() diff --git a/tests/pipelines/text_to_video/test_text_to_video.py b/tests/pipelines/text_to_video/test_text_to_video.py index 801af7f6b4e6..e03c8fc5dfb6 100644 --- a/tests/pipelines/text_to_video/test_text_to_video.py +++ b/tests/pipelines/text_to_video/test_text_to_video.py @@ -25,8 +25,15 @@ TextToVideoSDPipeline, UNet3DConditionModel, ) -from diffusers.utils import is_xformers_available, load_numpy, require_torch_gpu, skip_mps, slow, torch_device -from diffusers.utils.testing_utils import enable_full_determinism +from diffusers.utils import is_xformers_available +from diffusers.utils.testing_utils import ( + enable_full_determinism, + load_numpy, + require_torch_gpu, + skip_mps, + slow, + torch_device, +) from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS from ..test_pipelines_common import PipelineTesterMixin diff --git a/tests/pipelines/text_to_video/test_text_to_video_zero.py b/tests/pipelines/text_to_video/test_text_to_video_zero.py index 8fc7254c52d1..02fb43a0b65b 100644 --- a/tests/pipelines/text_to_video/test_text_to_video_zero.py +++ b/tests/pipelines/text_to_video/test_text_to_video_zero.py @@ -18,7 +18,7 @@ import torch from diffusers import DDIMScheduler, TextToVideoZeroPipeline -from diffusers.utils import load_pt, require_torch_gpu, slow +from diffusers.utils.testing_utils import load_pt, require_torch_gpu, slow from ..test_pipelines_common import assert_mean_pixel_difference diff --git a/tests/pipelines/text_to_video/test_video_to_video.py b/tests/pipelines/text_to_video/test_video_to_video.py index 9e61ddcbbd3f..6b1c44ceb057 100644 --- a/tests/pipelines/text_to_video/test_video_to_video.py +++ b/tests/pipelines/text_to_video/test_video_to_video.py @@ -26,8 +26,14 @@ UNet3DConditionModel, VideoToVideoSDPipeline, ) -from diffusers.utils import floats_tensor, is_xformers_available, skip_mps -from diffusers.utils.testing_utils import enable_full_determinism, slow, torch_device +from diffusers.utils import is_xformers_available +from diffusers.utils.testing_utils import ( + enable_full_determinism, + floats_tensor, + skip_mps, + slow, + torch_device, +) from ..pipeline_params import ( TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, diff --git a/tests/pipelines/unclip/test_unclip.py b/tests/pipelines/unclip/test_unclip.py index 46890904a3c6..111a8b918457 100644 --- a/tests/pipelines/unclip/test_unclip.py +++ b/tests/pipelines/unclip/test_unclip.py @@ -22,8 +22,15 @@ from diffusers import PriorTransformer, UnCLIPPipeline, UnCLIPScheduler, UNet2DConditionModel, UNet2DModel from diffusers.pipelines.unclip.text_proj import UnCLIPTextProjModel -from diffusers.utils import load_numpy, nightly, slow, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, skip_mps +from diffusers.utils.testing_utils import ( + enable_full_determinism, + load_numpy, + nightly, + require_torch_gpu, + skip_mps, + slow, + torch_device, +) from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference diff --git 
a/tests/pipelines/unclip/test_unclip_image_variation.py b/tests/pipelines/unclip/test_unclip_image_variation.py index 2604368104a3..6b4e2b0fc0b4 100644 --- a/tests/pipelines/unclip/test_unclip_image_variation.py +++ b/tests/pipelines/unclip/test_unclip_image_variation.py @@ -36,8 +36,16 @@ UNet2DModel, ) from diffusers.pipelines.unclip.text_proj import UnCLIPTextProjModel -from diffusers.utils import floats_tensor, load_numpy, slow, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, load_image, require_torch_gpu, skip_mps +from diffusers.utils.testing_utils import ( + enable_full_determinism, + floats_tensor, + load_image, + load_numpy, + require_torch_gpu, + skip_mps, + slow, + torch_device, +) from ..pipeline_params import IMAGE_VARIATION_BATCH_PARAMS, IMAGE_VARIATION_PARAMS from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference diff --git a/tests/pipelines/unidiffuser/test_unidiffuser.py b/tests/pipelines/unidiffuser/test_unidiffuser.py index e9506f660e38..865a7cfa6933 100644 --- a/tests/pipelines/unidiffuser/test_unidiffuser.py +++ b/tests/pipelines/unidiffuser/test_unidiffuser.py @@ -20,8 +20,8 @@ UniDiffuserPipeline, UniDiffuserTextDecoder, ) -from diffusers.utils import floats_tensor, load_image, nightly, randn_tensor, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu +from diffusers.utils.testing_utils import floats_tensor, load_image, nightly, require_torch_gpu, slow, torch_device +from diffusers.utils.torch_utils import randn_tensor from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS from ..test_pipelines_common import PipelineTesterMixin diff --git a/tests/pipelines/vq_diffusion/test_vq_diffusion.py b/tests/pipelines/vq_diffusion/test_vq_diffusion.py index 462d818f92d6..88e9f19df709 100644 --- a/tests/pipelines/vq_diffusion/test_vq_diffusion.py +++ b/tests/pipelines/vq_diffusion/test_vq_diffusion.py @@ -22,8 +22,7 @@ from diffusers import Transformer2DModel, VQDiffusionPipeline, VQDiffusionScheduler, VQModel from diffusers.pipelines.vq_diffusion.pipeline_vq_diffusion import LearnedClassifierFreeSamplingEmbeddings -from diffusers.utils import load_numpy, nightly, torch_device -from diffusers.utils.testing_utils import require_torch_gpu +from diffusers.utils.testing_utils import load_numpy, nightly, require_torch_gpu, torch_device torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/wuerstchen/test_wuerstchen_combined.py b/tests/pipelines/wuerstchen/test_wuerstchen_combined.py index 7d2e98030b30..9b680da27871 100644 --- a/tests/pipelines/wuerstchen/test_wuerstchen_combined.py +++ b/tests/pipelines/wuerstchen/test_wuerstchen_combined.py @@ -21,8 +21,7 @@ from diffusers import DDPMWuerstchenScheduler, WuerstchenCombinedPipeline from diffusers.pipelines.wuerstchen import PaellaVQModel, WuerstchenDiffNeXt, WuerstchenPrior -from diffusers.utils import torch_device -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, torch_device from ..test_pipelines_common import PipelineTesterMixin diff --git a/tests/pipelines/wuerstchen/test_wuerstchen_decoder.py b/tests/pipelines/wuerstchen/test_wuerstchen_decoder.py index 709e2c1a3436..7891056d10c5 100644 --- a/tests/pipelines/wuerstchen/test_wuerstchen_decoder.py +++ b/tests/pipelines/wuerstchen/test_wuerstchen_decoder.py @@ -21,8 +21,7 @@ from diffusers import 
DDPMWuerstchenScheduler, WuerstchenDecoderPipeline from diffusers.pipelines.wuerstchen import PaellaVQModel, WuerstchenDiffNeXt -from diffusers.utils import torch_device -from diffusers.utils.testing_utils import enable_full_determinism, skip_mps +from diffusers.utils.testing_utils import enable_full_determinism, skip_mps, torch_device from ..test_pipelines_common import PipelineTesterMixin diff --git a/tests/pipelines/wuerstchen/test_wuerstchen_prior.py b/tests/pipelines/wuerstchen/test_wuerstchen_prior.py index a255a665c48e..045729b30b6c 100644 --- a/tests/pipelines/wuerstchen/test_wuerstchen_prior.py +++ b/tests/pipelines/wuerstchen/test_wuerstchen_prior.py @@ -21,8 +21,7 @@ from diffusers import DDPMWuerstchenScheduler, WuerstchenPriorPipeline from diffusers.pipelines.wuerstchen import WuerstchenPrior -from diffusers.utils import torch_device -from diffusers.utils.testing_utils import enable_full_determinism, skip_mps +from diffusers.utils.testing_utils import enable_full_determinism, skip_mps, torch_device from ..test_pipelines_common import PipelineTesterMixin @@ -146,7 +145,6 @@ def test_wuerstchen_prior(self): image_slice = image[0, 0, 0, -10:] image_from_tuple_slice = image_from_tuple[0, 0, 0, -10:] - assert image.shape == (1, 2, 24, 24) expected_slice = np.array( @@ -161,7 +159,7 @@ def test_wuerstchen_prior(self): 218.00089, -2731.5745, -8056.734, - ], + ] ) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 @@ -176,7 +174,7 @@ def test_inference_batch_single_identical(self): test_max_difference=test_max_difference, relax_max_difference=relax_max_difference, test_mean_pixel_difference=test_mean_pixel_difference, - expected_max_diff=1e-1, + expected_max_diff=2e-1, ) @skip_mps diff --git a/tests/schedulers/test_scheduler_dpm_sde.py b/tests/schedulers/test_scheduler_dpm_sde.py index 7906c8d5d4e9..253a0a478b41 100644 --- a/tests/schedulers/test_scheduler_dpm_sde.py +++ b/tests/schedulers/test_scheduler_dpm_sde.py @@ -1,8 +1,7 @@ import torch from diffusers import DPMSolverSDEScheduler -from diffusers.utils import torch_device -from diffusers.utils.testing_utils import require_torchsde +from diffusers.utils.testing_utils import require_torchsde, torch_device from .test_schedulers import SchedulerCommonTest diff --git a/tests/schedulers/test_scheduler_euler.py b/tests/schedulers/test_scheduler_euler.py index 0c3b065161db..2aba46ba3381 100644 --- a/tests/schedulers/test_scheduler_euler.py +++ b/tests/schedulers/test_scheduler_euler.py @@ -1,7 +1,7 @@ import torch from diffusers import EulerDiscreteScheduler -from diffusers.utils import torch_device +from diffusers.utils.testing_utils import torch_device from .test_schedulers import SchedulerCommonTest diff --git a/tests/schedulers/test_scheduler_euler_ancestral.py b/tests/schedulers/test_scheduler_euler_ancestral.py index 9866bd12d6af..b2887e89b720 100644 --- a/tests/schedulers/test_scheduler_euler_ancestral.py +++ b/tests/schedulers/test_scheduler_euler_ancestral.py @@ -1,7 +1,7 @@ import torch from diffusers import EulerAncestralDiscreteScheduler -from diffusers.utils import torch_device +from diffusers.utils.testing_utils import torch_device from .test_schedulers import SchedulerCommonTest diff --git a/tests/schedulers/test_scheduler_heun.py b/tests/schedulers/test_scheduler_heun.py index ae0fe26b11ba..69f6526b673a 100644 --- a/tests/schedulers/test_scheduler_heun.py +++ b/tests/schedulers/test_scheduler_heun.py @@ -1,7 +1,7 @@ import torch 
from diffusers import HeunDiscreteScheduler -from diffusers.utils import torch_device +from diffusers.utils.testing_utils import torch_device from .test_schedulers import SchedulerCommonTest diff --git a/tests/schedulers/test_scheduler_kdpm2_ancestral.py b/tests/schedulers/test_scheduler_kdpm2_ancestral.py index 45371121e66b..b3d391ac8a83 100644 --- a/tests/schedulers/test_scheduler_kdpm2_ancestral.py +++ b/tests/schedulers/test_scheduler_kdpm2_ancestral.py @@ -1,7 +1,7 @@ import torch from diffusers import KDPM2AncestralDiscreteScheduler -from diffusers.utils import torch_device +from diffusers.utils.testing_utils import torch_device from .test_schedulers import SchedulerCommonTest diff --git a/tests/schedulers/test_scheduler_kdpm2_discrete.py b/tests/schedulers/test_scheduler_kdpm2_discrete.py index 4f1bd1f8aeb7..4876caaa996f 100644 --- a/tests/schedulers/test_scheduler_kdpm2_discrete.py +++ b/tests/schedulers/test_scheduler_kdpm2_discrete.py @@ -1,7 +1,7 @@ import torch from diffusers import KDPM2DiscreteScheduler -from diffusers.utils import torch_device +from diffusers.utils.testing_utils import torch_device from .test_schedulers import SchedulerCommonTest diff --git a/tests/schedulers/test_scheduler_lms.py b/tests/schedulers/test_scheduler_lms.py index 1e0a8212354d..cd5376d305c4 100644 --- a/tests/schedulers/test_scheduler_lms.py +++ b/tests/schedulers/test_scheduler_lms.py @@ -1,7 +1,7 @@ import torch from diffusers import LMSDiscreteScheduler -from diffusers.utils import torch_device +from diffusers.utils.testing_utils import torch_device from .test_schedulers import SchedulerCommonTest diff --git a/tests/schedulers/test_schedulers.py b/tests/schedulers/test_schedulers.py index 4b1834f62a4e..b936b6334627 100755 --- a/tests/schedulers/test_schedulers.py +++ b/tests/schedulers/test_schedulers.py @@ -40,8 +40,7 @@ ) from diffusers.configuration_utils import ConfigMixin, register_to_config from diffusers.schedulers.scheduling_utils import SchedulerMixin -from diffusers.utils import torch_device -from diffusers.utils.testing_utils import CaptureLogger +from diffusers.utils.testing_utils import CaptureLogger, torch_device from ..others.test_utils import TOKEN, USER, is_staging_test diff --git a/utils/check_copies.py b/utils/check_copies.py index 0ba573bb920e..df5816b4ac03 100644 --- a/utils/check_copies.py +++ b/utils/check_copies.py @@ -15,7 +15,6 @@ import argparse import glob -import importlib.util import os import re @@ -29,15 +28,6 @@ REPO_PATH = "." -# This is to make sure the diffusers module imported is the one in the repo. 
-spec = importlib.util.spec_from_file_location( - "diffusers", - os.path.join(DIFFUSERS_PATH, "__init__.py"), - submodule_search_locations=[DIFFUSERS_PATH], -) -diffusers_module = spec.loader.load_module() - - def _should_continue(line, indent): return line.startswith(indent) or len(line) <= 1 or re.search(r"^\s*\)(\s*->.*:|:)\s*$", line) is not None diff --git a/utils/check_dummies.py b/utils/check_dummies.py index 16b7c8c117dc..8754babc554b 100644 --- a/utils/check_dummies.py +++ b/utils/check_dummies.py @@ -71,24 +71,27 @@ def read_init(): # Get to the point we do the actual imports for type checking line_index = 0 + while not lines[line_index].startswith("if TYPE_CHECKING"): + line_index += 1 + backend_specific_objects = {} # Go through the end of the file while line_index < len(lines): # If the line contains is_backend_available, we grab all objects associated with the `else` block backend = find_backend(lines[line_index]) if backend is not None: - while not lines[line_index].startswith("else:"): + while not lines[line_index].startswith(" else:"): line_index += 1 line_index += 1 objects = [] # Until we unindent, add backend objects to the list - while line_index < len(lines) and len(lines[line_index]) > 1: + while len(lines[line_index]) <= 1 or lines[line_index].startswith(" " * 8): line = lines[line_index] single_line_import_search = _re_single_line_import.search(line) if single_line_import_search is not None: objects.extend(single_line_import_search.groups()[0].split(", ")) - elif line.startswith(" " * 8): - objects.append(line[8:-2]) + elif line.startswith(" " * 12): + objects.append(line[12:-2]) line_index += 1 if len(objects) > 0: From 2c60f7d14e5297a61301c8bb2698717c244d3e43 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 11 Sep 2023 11:22:40 +0200 Subject: [PATCH 11/37] [Core] Remove TF import checks (#4968) [TF] Remove tf --- src/diffusers/utils/__init__.py | 1 - src/diffusers/utils/import_utils.py | 41 ----------------------------- utils/check_repo.py | 8 +----- 3 files changed, 1 insertion(+), 49 deletions(-) diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py index a846f6caef08..7390a2f69d23 100644 --- a/src/diffusers/utils/__init__.py +++ b/src/diffusers/utils/__init__.py @@ -69,7 +69,6 @@ is_onnx_available, is_scipy_available, is_tensorboard_available, - is_tf_available, is_torch_available, is_torch_version, is_torchsde_available, diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py index 1cf319e2941b..587949ab0c52 100644 --- a/src/diffusers/utils/import_utils.py +++ b/src/diffusers/utils/import_utils.py @@ -62,43 +62,6 @@ logger.info("Disabling PyTorch because USE_TORCH is set") _torch_available = False - -_tf_version = "N/A" -if USE_TF in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TORCH not in ENV_VARS_TRUE_VALUES: - _tf_available = importlib.util.find_spec("tensorflow") is not None - if _tf_available: - candidates = ( - "tensorflow", - "tensorflow-cpu", - "tensorflow-gpu", - "tf-nightly", - "tf-nightly-cpu", - "tf-nightly-gpu", - "intel-tensorflow", - "intel-tensorflow-avx512", - "tensorflow-rocm", - "tensorflow-macos", - "tensorflow-aarch64", - ) - _tf_version = None - # For the metadata, we have to look for both tensorflow and tensorflow-cpu - for pkg in candidates: - try: - _tf_version = importlib_metadata.version(pkg) - break - except importlib_metadata.PackageNotFoundError: - pass - _tf_available = _tf_version is not None - if _tf_available: - if version.parse(_tf_version) < version.parse("2"): - 
logger.info(f"TensorFlow found but with version {_tf_version}. Diffusers requires version 2 minimum.") - _tf_available = False - else: - logger.info(f"TensorFlow version {_tf_version} available.") -else: - logger.info("Disabling Tensorflow because USE_TORCH is set") - _tf_available = False - _jax_version = "N/A" _flax_version = "N/A" if USE_JAX in ENV_VARS_TRUE_AND_AUTO_VALUES: @@ -308,10 +271,6 @@ def is_torch_available(): return _torch_available -def is_tf_available(): - return _tf_available - - def is_flax_available(): return _flax_available diff --git a/utils/check_repo.py b/utils/check_repo.py index dffbb323f917..6f0417d69065 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -23,7 +23,7 @@ from pathlib import Path from diffusers.models.auto import get_values -from diffusers.utils import ENV_VARS_TRUE_VALUES, is_flax_available, is_tf_available, is_torch_available +from diffusers.utils import ENV_VARS_TRUE_VALUES, is_flax_available, is_torch_available # All paths are set with the intent you should run this script from the root of the repo with the command @@ -421,10 +421,6 @@ def get_all_auto_configured_models(): for attr_name in dir(diffusers.models.auto.modeling_auto): if attr_name.startswith("MODEL_") and attr_name.endswith("MAPPING_NAMES"): result = result | set(get_values(getattr(diffusers.models.auto.modeling_auto, attr_name))) - if is_tf_available(): - for attr_name in dir(diffusers.models.auto.modeling_tf_auto): - if attr_name.startswith("TF_MODEL_") and attr_name.endswith("MAPPING_NAMES"): - result = result | set(get_values(getattr(diffusers.models.auto.modeling_tf_auto, attr_name))) if is_flax_available(): for attr_name in dir(diffusers.models.auto.modeling_flax_auto): if attr_name.startswith("FLAX_MODEL_") and attr_name.endswith("MAPPING_NAMES"): @@ -462,8 +458,6 @@ def check_all_models_are_auto_configured(): missing_backends = [] if not is_torch_available(): missing_backends.append("PyTorch") - if not is_tf_available(): - missing_backends.append("TensorFlow") if not is_flax_available(): missing_backends.append("Flax") if len(missing_backends) > 0: From 6bbee1048bc5519fce91ebd81a592449b4f5f6c0 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 11 Sep 2023 12:03:49 +0200 Subject: [PATCH 12/37] Make sure Flax pipelines can be loaded into PyTorch (#4971) * Make sure Flax pipelines can be loaded into PyTorch * add test * Update src/diffusers/pipelines/pipeline_utils.py --- src/diffusers/pipelines/pipeline_utils.py | 7 ++++++- tests/pipelines/test_pipelines.py | 10 +++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index fb120ebc7d3b..b357fe6a8b29 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -342,7 +342,12 @@ def _get_pipeline_class( return class_obj diffusers_module = importlib.import_module(class_obj.__module__.split(".")[0]) - pipeline_cls = getattr(diffusers_module, config["_class_name"]) + class_name = config["_class_name"] + + if class_name.startswith("Flax"): + class_name = class_name[4:] + + pipeline_cls = getattr(diffusers_module, class_name) if load_connected_pipeline: from .auto_pipeline import _get_connected_pipeline diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py index 927c5ec28518..5a0c300c60c4 100644 --- a/tests/pipelines/test_pipelines.py +++ b/tests/pipelines/test_pipelines.py @@ -57,7 +57,7 @@ UniPCMultistepScheduler, logging, ) -from 
diffusers.pipelines.pipeline_utils import variant_compatible_siblings +from diffusers.pipelines.pipeline_utils import _get_pipeline_class, variant_compatible_siblings from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME from diffusers.utils import ( CONFIG_NAME, @@ -805,6 +805,14 @@ def test_download_ignore_files(self): assert not any(f in ["vae/diffusion_pytorch_model.bin", "text_encoder/config.json"] for f in files) assert len(files) == 14 + def test_get_pipeline_class_from_flax(self): + flax_config = {"_class_name": "FlaxStableDiffusionPipeline"} + config = {"_class_name": "StableDiffusionPipeline"} + + # when loading a PyTorch Pipeline from a FlaxPipeline `model_index.json`, e.g.: https://huggingface.co/hf-internal-testing/tiny-stable-diffusion-lms-pipe/blob/7a9063578b325779f0f1967874a6771caa973cad/model_index.json#L2 + # we need to make sure that we don't load the Flax Pipeline class, but instead the PyTorch pipeline class + assert _get_pipeline_class(DiffusionPipeline, flax_config) == _get_pipeline_class(DiffusionPipeline, config) + class CustomPipelineTests(unittest.TestCase): def test_load_custom_pipeline(self): From 6c6a246461829d7ba6af26655c7418ad228a28f3 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 11 Sep 2023 15:45:19 +0200 Subject: [PATCH 13/37] Update README.md (#4973) Add monthly pip installs --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index ec6bddbc1fbf..9f80fecf2222 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,9 @@ GitHub release + + GitHub release + Contributor Covenant From 16a056a7b5801c3b93cc245680ade7e77ed0bd3a Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 11 Sep 2023 15:47:53 +0200 Subject: [PATCH 14/37] Wuerstchen fixes (#4942) * fix arguments and make example code work * change arguments in combined test * Add default timesteps * style * fixed test * fix broken test * formatting * fix docstrings * fix num_images_per_prompt * fix doc styles * please dont change this * fix tests * rename to DEFAULT_STAGE_C_TIMESTEPS --------- Co-authored-by: Dominic Rampas --- docs/source/en/api/pipelines/wuerstchen.md | 41 +++++++---------- scripts/convert_wuerstchen.py | 6 +-- .../pipelines/wuerstchen/__init__.py | 2 +- .../wuerstchen/pipeline_wuerstchen.py | 8 ++-- .../pipeline_wuerstchen_combined.py | 46 ++++++++++--------- .../wuerstchen/pipeline_wuerstchen_prior.py | 21 +++++++-- .../wuerstchen/test_wuerstchen_combined.py | 5 +- 7 files changed, 69 insertions(+), 60 deletions(-) diff --git a/docs/source/en/api/pipelines/wuerstchen.md b/docs/source/en/api/pipelines/wuerstchen.md index 9ead9456bfb3..797248ebb4b7 100644 --- a/docs/source/en/api/pipelines/wuerstchen.md +++ b/docs/source/en/api/pipelines/wuerstchen.md @@ -17,6 +17,7 @@ After the initial paper release, we have improved numerous things in the archite - Multi Aspect Resolution Sampling - Better quality + We are releasing 3 checkpoints for the text-conditional image generation model (Stage C). Those are: - v2-base @@ -24,7 +25,7 @@ We are releasing 3 checkpoints for the text-conditional image generation model ( - v2-interpolated (50% interpolation between v2-base and v2-aesthetic) We recommend to use v2-interpolated, as it has a nice touch of both photorealism and aesthetic. Use v2-base for finetunings as it does not have a style bias and use v2-aesthetic for very artistic generations. 
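Note that all three checkpoints are variants of Stage C (the prior) only, so switching between them means loading a different prior pipeline while the decoder stays the same. A minimal sketch of that swap, assuming a hypothetical hub id for the aesthetic variant (the official ids for v2-base and v2-aesthetic are not listed in this doc; the examples below load `warp-ai/wuerstchen-prior`):

```python
import torch
from diffusers import WuerstchenPriorPipeline

# Hypothetical repo id -- substitute the actual hub id of the v2-aesthetic
# (or v2-base) prior checkpoint. Only Stage C changes between the variants;
# the decoder pipeline is shared.
prior_pipeline = WuerstchenPriorPipeline.from_pretrained(
    "warp-ai/wuerstchen-prior-v2-aesthetic",  # assumed name, not confirmed here
    torch_dtype=torch.float16,
).to("cuda")
```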
-A comparison can be seen here: +A comparison can be seen here: @@ -35,27 +36,18 @@ For the sake of usability Würstchen can be used with a single pipeline. This pi ```python import torch from diffusers import AutoPipelineForText2Image +from diffusers.pipelines.wuerstchen import DEFAULT_STAGE_C_TIMESTEPS -device = "cuda" -dtype = torch.float16 -num_images_per_prompt = 2 - -pipeline = AutoPipelineForText2Image.from_pretrained( - "warp-diffusion/wuerstchen", torch_dtype=dtype -).to(device) +pipe = AutoPipelineForText2Image.from_pretrained("warp-ai/wuerstchen", torch_dtype=torch.float16).to("cuda") caption = "Anthropomorphic cat dressed as a fire fighter" -negative_prompt = "" - -output = pipeline( - prompt=caption, - height=1024, +images = pipe( + caption, width=1024, - negative_prompt=negative_prompt, + height=1536, + prior_timesteps=DEFAULT_STAGE_C_TIMESTEPS, prior_guidance_scale=4.0, - decoder_guidance_scale=0.0, - num_images_per_prompt=num_images_per_prompt, - output_type="pil", + num_images_per_prompt=2, ).images ``` @@ -64,27 +56,29 @@ For explanation purposes, we can also initialize the two main pipelines of Würs ```python import torch from diffusers import WuerstchenDecoderPipeline, WuerstchenPriorPipeline +from diffusers.pipelines.wuerstchen import DEFAULT_STAGE_C_TIMESTEPS device = "cuda" dtype = torch.float16 num_images_per_prompt = 2 prior_pipeline = WuerstchenPriorPipeline.from_pretrained( - "warp-diffusion/wuerstchen-prior", torch_dtype=dtype + "warp-ai/wuerstchen-prior", torch_dtype=dtype ).to(device) decoder_pipeline = WuerstchenDecoderPipeline.from_pretrained( - "warp-diffusion/wuerstchen", torch_dtype=dtype + "warp-ai/wuerstchen", torch_dtype=dtype ).to(device) -caption = "A captivating artwork of a mysterious stone golem" +caption = "Anthropomorphic cat dressed as a fire fighter" negative_prompt = "" prior_output = prior_pipeline( prompt=caption, height=1024, - width=1024, + width=1536, + timesteps=DEFAULT_STAGE_C_TIMESTEPS, negative_prompt=negative_prompt, - guidance_scale=4.0, + guidance_scale=4.0, num_images_per_prompt=num_images_per_prompt, ) decoder_output = decoder_pipeline( @@ -109,13 +103,12 @@ pipeline.decoder = torch.compile(pipeline.decoder, mode="reduce-overhead", fullg - Due to the high compression employed by Würstchen, generations can lack a good amount of detail. To our human eye, this is especially noticeable in faces, hands etc. -- **Images can only be generated in 128-pixel steps**, e.g. the next higher resolution +- **Images can only be generated in 128-pixel steps**, e.g. the next higher resolution after 1024x1024 is 1152x1152 - The model lacks the ability to render correct text in images - The model often does not achieve photorealism - Difficult compositional prompts are hard for the model - The original codebase, as well as experimental ideas, can be found at [dome272/Wuerstchen](https://github.com/dome272/Wuerstchen). 
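For reference, the `DEFAULT_STAGE_C_TIMESTEPS` schedule imported in the examples above is defined later in this same patch (in `pipeline_wuerstchen_prior.py`) as a 30-step descending schedule that samples the high-noise region more densely. A minimal sketch to inspect it:

```python
import numpy as np

# Definition taken verbatim from pipeline_wuerstchen_prior.py in this patch:
# 20 steps from 1.0 down to 2/3, then 10 more from 2/3 down to 0.0
# (the duplicated 2/3 boundary value is dropped by the [1:] slice).
DEFAULT_STAGE_C_TIMESTEPS = list(np.linspace(1.0, 2 / 3, 20)) + list(np.linspace(2 / 3, 0.0, 11))[1:]

print(len(DEFAULT_STAGE_C_TIMESTEPS))  # 30 timesteps in total, descending
print(DEFAULT_STAGE_C_TIMESTEPS[0], DEFAULT_STAGE_C_TIMESTEPS[-1])  # 1.0 ... 0.0
# Step size is ~0.0175 above 2/3 and ~0.0667 below it, i.e. the early
# (high-noise) part of the denoising trajectory gets more steps.
```

As the combined-pipeline example shows, this list is passed through `prior_timesteps`; the prior's docstring requires custom timesteps to be in descending order.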
## WuerschenPipeline diff --git a/scripts/convert_wuerstchen.py b/scripts/convert_wuerstchen.py index 91fd9b79b4ee..23d45d3dd6ad 100644 --- a/scripts/convert_wuerstchen.py +++ b/scripts/convert_wuerstchen.py @@ -91,12 +91,12 @@ prior=prior_model, text_encoder=text_encoder, tokenizer=tokenizer, scheduler=scheduler ) -prior_pipeline.save_pretrained("warp-diffusion/wuerstchen-prior") +prior_pipeline.save_pretrained("warp-ai/wuerstchen-prior") decoder_pipeline = WuerstchenDecoderPipeline( text_encoder=gen_text_encoder, tokenizer=gen_tokenizer, vqgan=vqmodel, decoder=deocder, scheduler=scheduler ) -decoder_pipeline.save_pretrained("warp-diffusion/wuerstchen") +decoder_pipeline.save_pretrained("warp-ai/wuerstchen") # Wuerstchen pipeline wuerstchen_pipeline = WuerstchenCombinedPipeline( @@ -112,4 +112,4 @@ prior=prior_model, prior_scheduler=scheduler, ) -wuerstchen_pipeline.save_pretrained("warp-diffusion/WuerstchenCombinedPipeline") +wuerstchen_pipeline.save_pretrained("warp-ai/WuerstchenCombinedPipeline") diff --git a/src/diffusers/pipelines/wuerstchen/__init__.py b/src/diffusers/pipelines/wuerstchen/__init__.py index f77b597a0b92..17da4c1ad8c5 100644 --- a/src/diffusers/pipelines/wuerstchen/__init__.py +++ b/src/diffusers/pipelines/wuerstchen/__init__.py @@ -24,7 +24,7 @@ _import_structure["modeling_wuerstchen_prior"] = ["WuerstchenPrior"] _import_structure["pipeline_wuerstchen"] = ["WuerstchenDecoderPipeline"] _import_structure["pipeline_wuerstchen_combined"] = ["WuerstchenCombinedPipeline"] - _import_structure["pipeline_wuerstchen_prior"] = ["WuerstchenPriorPipeline"] + _import_structure["pipeline_wuerstchen_prior"] = ["DEFAULT_STAGE_C_TIMESTEPS", "WuerstchenPriorPipeline"] import sys diff --git a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py index 7f6b0546da7b..4dfa4727ddd0 100644 --- a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +++ b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py @@ -35,11 +35,11 @@ >>> from diffusers import WuerstchenPriorPipeline, WuerstchenDecoderPipeline >>> prior_pipe = WuerstchenPriorPipeline.from_pretrained( - ... "warp-diffusion/wuerstchen-prior", torch_dtype=torch.float16 - ... ).to("cuda") - >>> gen_pipe = WuerstchenDecoderPipeline.from_pretrain( - ... "warp-diffusion/wuerstchen", torch_dtype=torch.float16 + ... "warp-ai/wuerstchen-prior", torch_dtype=torch.float16 ... ).to("cuda") + >>> gen_pipe = WuerstchenDecoderPipeline.from_pretrain("warp-ai/wuerstchen", torch_dtype=torch.float16).to( + ... "cuda" + ... ) >>> prompt = "an image of a shiba inu, donning a spacesuit and helmet" >>> prior_output = pipe(prompt) diff --git a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py index ff4c31686bf5..590162bd0d16 100644 --- a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +++ b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py @@ -31,9 +31,9 @@ ```py >>> from diffusions import WuerstchenCombinedPipeline - >>> pipe = WuerstchenCombinedPipeline.from_pretrained( - ... "warp-diffusion/Wuerstchen", torch_dtype=torch.float16 - ... ).to("cuda") + >>> pipe = WuerstchenCombinedPipeline.from_pretrained("warp-ai/Wuerstchen", torch_dtype=torch.float16).to( + ... "cuda" + ... 
) >>> prompt = "an image of a shiba inu, donning a spacesuit and helmet" >>> images = pipe(prompt=prompt) ``` @@ -145,16 +145,16 @@ def set_progress_bar_config(self, **kwargs): def __call__( self, prompt: Union[str, List[str]], - negative_prompt: Optional[Union[str, List[str]]] = None, - guidance_scale: float = 4.0, - num_images_per_prompt: int = 1, height: int = 512, width: int = 512, - prior_guidance_scale: float = 4.0, prior_num_inference_steps: int = 60, - num_inference_steps: int = 12, prior_timesteps: Optional[List[float]] = None, - timesteps: Optional[List[float]] = None, + prior_guidance_scale: float = 4.0, + num_inference_steps: int = 12, + decoder_timesteps: Optional[List[float]] = None, + decoder_guidance_scale: float = 0.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: int = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, latents: Optional[torch.FloatTensor] = None, output_type: Optional[str] = "pil", @@ -182,19 +182,20 @@ def __call__( `prior_guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality. prior_num_inference_steps (`Union[int, Dict[float, int]]`, *optional*, defaults to 30): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the + The number of prior denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. For more specific timestep spacing, you can pass customized `prior_timesteps` num_inference_steps (`int`, *optional*, defaults to 12): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. For more specific timestep spacing, you can pass customized `timesteps` + The number of decoder denoising steps. More denoising steps usually lead to a higher quality image at + the expense of slower inference. For more specific timestep spacing, you can pass customized + `timesteps` prior_timesteps (`List[float]`, *optional*): Custom timesteps to use for the denoising process for the prior. If not defined, equal spaced `prior_num_inference_steps` timesteps are used. Must be in descending order. - timesteps (`List[float]`, *optional*): + decoder_timesteps (`List[float]`, *optional*): Custom timesteps to use for the denoising process for the decoder. If not defined, equal spaced - `decoder_num_inference_steps` timesteps are used. Must be in descending order. - guidance_scale (`float`, *optional*, defaults to 4.0): + `num_inference_steps` timesteps are used. Must be in descending order. + decoder_guidance_scale (`float`, *optional*, defaults to 0.0): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of [Imagen Paper](https://arxiv.org/pdf/2205.11487.pdf). 
Guidance scale is enabled by setting `guidance_scale > @@ -221,27 +222,28 @@ def __call__( """ prior_outputs = self.prior_pipe( prompt=prompt, - negative_prompt=negative_prompt, - width=width, height=height, - num_images_per_prompt=num_images_per_prompt, + width=width, num_inference_steps=prior_num_inference_steps, timesteps=prior_timesteps, + guidance_scale=prior_guidance_scale, + negative_prompt=negative_prompt, + num_images_per_prompt=num_images_per_prompt, generator=generator, latents=latents, - guidance_scale=prior_guidance_scale, output_type="pt", return_dict=False, ) image_embeddings = prior_outputs[0] outputs = self.decoder_pipe( - prompt=prompt, image_embeddings=image_embeddings, + prompt=prompt, num_inference_steps=num_inference_steps, - timesteps=timesteps, + timesteps=decoder_timesteps, + guidance_scale=decoder_guidance_scale, + negative_prompt=negative_prompt, generator=generator, - guidance_scale=guidance_scale, output_type=output_type, return_dict=return_dict, ) diff --git a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py index 297462bd96f7..9993d30b2072 100644 --- a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +++ b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py @@ -14,7 +14,7 @@ from dataclasses import dataclass from math import ceil -from typing import List, Optional, Union +from typing import Callable, List, Optional, Union import numpy as np import torch @@ -35,6 +35,8 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name +DEFAULT_STAGE_C_TIMESTEPS = list(np.linspace(1.0, 2 / 3, 20)) + list(np.linspace(2 / 3, 0.0, 11))[1:] + EXAMPLE_DOC_STRING = """ Examples: ```py @@ -42,7 +44,7 @@ >>> from diffusers import WuerstchenPriorPipeline >>> prior_pipe = WuerstchenPriorPipeline.from_pretrained( - ... "warp-diffusion/wuerstchen-prior", torch_dtype=torch.float16 + ... "warp-ai/wuerstchen-prior", torch_dtype=torch.float16 ... ).to("cuda") >>> prompt = "an image of a shiba inu, donning a spacesuit and helmet" @@ -265,7 +267,7 @@ def __call__( prompt: Union[str, List[str]] = None, height: int = 1024, width: int = 1024, - num_inference_steps: int = 30, + num_inference_steps: int = 60, timesteps: List[float] = None, guidance_scale: float = 8.0, negative_prompt: Optional[Union[str, List[str]]] = None, @@ -274,6 +276,8 @@ def __call__( latents: Optional[torch.FloatTensor] = None, output_type: Optional[str] = "pt", return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, ): """ Function invoked when calling the pipeline for generation. @@ -314,6 +318,12 @@ def __call__( (`np.array`) or `"pt"` (`torch.Tensor`). return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. Examples: @@ -365,7 +375,7 @@ def __call__( latents = self.prepare_latents(effnet_features_shape, dtype, device, generator, latents, self.scheduler) # 6. 
Run denoising loop - for t in self.progress_bar(timesteps[:-1]): + for i, t in enumerate(self.progress_bar(timesteps[:-1])): ratio = t.expand(latents.size(0)).to(dtype) # 7. Denoise image embeddings @@ -390,6 +400,9 @@ def __call__( generator=generator, ).prev_sample + if callback is not None and i % callback_steps == 0: + callback(i, t, latents) + # 10. Denormalize the latents latents = latents * self.config.latent_mean - self.config.latent_std diff --git a/tests/pipelines/wuerstchen/test_wuerstchen_combined.py b/tests/pipelines/wuerstchen/test_wuerstchen_combined.py index 9b680da27871..b567f507d1d2 100644 --- a/tests/pipelines/wuerstchen/test_wuerstchen_combined.py +++ b/tests/pipelines/wuerstchen/test_wuerstchen_combined.py @@ -38,7 +38,8 @@ class WuerstchenCombinedPipelineFastTests(PipelineTesterMixin, unittest.TestCase "height", "width", "latents", - "guidance_scale", + "prior_guidance_scale", + "decoder_guidance_scale", "negative_prompt", "num_inference_steps", "return_dict", @@ -160,7 +161,7 @@ def get_dummy_inputs(self, device, seed=0): "prompt": "horse", "generator": generator, "prior_guidance_scale": 4.0, - "guidance_scale": 4.0, + "decoder_guidance_scale": 4.0, "num_inference_steps": 2, "prior_num_inference_steps": 2, "output_type": "np", From 93579650f8f3fd2c49d665c7dc582d5111583d02 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 11 Sep 2023 19:39:26 +0200 Subject: [PATCH 15/37] Refactor model offload (#4514) * [Draft] Refactor model offload * [Draft] Refactor model offload * Apply suggestions from code review * cpu offlaod updates * remove model cpu offload from individual pipelines * add hook to offload models to cpu * clean up * model offload * add model cpu offload string * make style * clean up * fixes for offload issues * fix tests issues * resolve merge conflicts * update src/diffusers/pipelines/pipeline_utils.py Co-authored-by: Patrick von Platen * make style * Update src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py --------- Co-authored-by: Dhruv Nair --- .../alt_diffusion/pipeline_alt_diffusion.py | 36 +--------- .../pipeline_alt_diffusion_img2img.py | 36 +--------- .../pipelines/audioldm/pipeline_audioldm.py | 1 + .../pipelines/audioldm2/pipeline_audioldm2.py | 2 + .../pipeline_consistency_models.py | 36 +--------- .../controlnet/pipeline_controlnet.py | 36 +--------- .../controlnet/pipeline_controlnet_img2img.py | 36 +--------- .../controlnet/pipeline_controlnet_inpaint.py | 36 +--------- .../pipeline_controlnet_inpaint_sd_xl.py | 33 +-------- .../controlnet/pipeline_controlnet_sd_xl.py | 58 ++++------------ .../pipeline_controlnet_sd_xl_img2img.py | 35 +--------- .../pipeline_dance_diffusion.py | 1 + src/diffusers/pipelines/ddim/pipeline_ddim.py | 1 + src/diffusers/pipelines/ddpm/pipeline_ddpm.py | 1 + .../pipelines/deepfloyd_if/pipeline_if.py | 48 +------------ .../deepfloyd_if/pipeline_if_img2img.py | 49 +------------- .../pipeline_if_img2img_superresolution.py | 49 +------------- .../deepfloyd_if/pipeline_if_inpainting.py | 49 +------------- .../pipeline_if_inpainting_superresolution.py | 44 +----------- .../pipeline_if_superresolution.py | 49 +------------- src/diffusers/pipelines/dit/pipeline_dit.py | 1 + .../pipelines/kandinsky/pipeline_kandinsky.py | 29 +------- .../kandinsky/pipeline_kandinsky_combined.py | 33 +-------- .../kandinsky/pipeline_kandinsky_img2img.py | 29 +------- .../kandinsky/pipeline_kandinsky_inpaint.py | 29 +------- .../kandinsky/pipeline_kandinsky_prior.py | 35 +--------- 
.../kandinsky2_2/pipeline_kandinsky2_2.py | 33 +-------- .../pipeline_kandinsky2_2_combined.py | 23 +------ .../pipeline_kandinsky2_2_controlnet.py | 37 ++-------- ...ipeline_kandinsky2_2_controlnet_img2img.py | 37 ++-------- .../pipeline_kandinsky2_2_img2img.py | 37 ++-------- .../pipeline_kandinsky2_2_inpainting.py | 37 ++-------- .../pipeline_kandinsky2_2_prior.py | 32 +-------- .../pipeline_kandinsky2_2_prior_emb2emb.py | 32 +-------- .../pipeline_latent_diffusion.py | 1 + .../pipelines/musicldm/pipeline_musicldm.py | 44 +++++++++++- .../pipeline_paint_by_example.py | 5 ++ src/diffusers/pipelines/pipeline_utils.py | 67 +++++++++++++++++++ .../pipelines/repaint/pipeline_repaint.py | 1 + .../pipeline_semantic_stable_diffusion.py | 1 + .../pipelines/shap_e/pipeline_shap_e.py | 40 ++--------- .../shap_e/pipeline_shap_e_img2img.py | 8 ++- .../pipeline_cycle_diffusion.py | 32 +-------- .../pipeline_stable_diffusion.py | 36 +--------- ...line_stable_diffusion_attend_and_excite.py | 1 + .../pipeline_stable_diffusion_depth2img.py | 1 + .../pipeline_stable_diffusion_diffedit.py | 47 ++----------- .../pipeline_stable_diffusion_gligen.py | 31 +-------- ...line_stable_diffusion_gligen_text_image.py | 31 +-------- ...peline_stable_diffusion_image_variation.py | 3 + .../pipeline_stable_diffusion_img2img.py | 37 +--------- .../pipeline_stable_diffusion_inpaint.py | 37 ++-------- ...ipeline_stable_diffusion_inpaint_legacy.py | 37 ++-------- ...eline_stable_diffusion_instruct_pix2pix.py | 37 ++-------- .../pipeline_stable_diffusion_k_diffusion.py | 37 ++-------- ...ipeline_stable_diffusion_latent_upscale.py | 1 + .../pipeline_stable_diffusion_ldm3d.py | 37 +--------- ...pipeline_stable_diffusion_model_editing.py | 6 +- .../pipeline_stable_diffusion_panorama.py | 1 + .../pipeline_stable_diffusion_paradigms.py | 37 +--------- .../pipeline_stable_diffusion_pix2pix_zero.py | 37 ++-------- .../pipeline_stable_diffusion_sag.py | 1 + .../pipeline_stable_diffusion_upscale.py | 29 +------- .../pipeline_stable_unclip.py | 28 +------- .../pipeline_stable_unclip_img2img.py | 30 +-------- .../pipeline_stable_diffusion_safe.py | 1 + .../pipeline_stable_diffusion_xl.py | 46 +++---------- .../pipeline_stable_diffusion_xl_img2img.py | 37 ++-------- .../pipeline_stable_diffusion_xl_inpaint.py | 38 ++--------- ...ne_stable_diffusion_xl_instruct_pix2pix.py | 40 +---------- .../pipeline_stable_diffusion_adapter.py | 36 +--------- .../pipeline_stable_diffusion_xl_adapter.py | 34 +--------- .../pipeline_text_to_video_synth.py | 33 +-------- .../pipeline_text_to_video_synth_img2img.py | 33 +-------- .../pipeline_text_to_video_zero.py | 5 +- .../pipelines/unclip/pipeline_unclip.py | 2 + .../unclip/pipeline_unclip_image_variation.py | 1 + .../unidiffuser/pipeline_unidiffuser.py | 15 ++++- ...ipeline_versatile_diffusion_dual_guided.py | 2 + ...ine_versatile_diffusion_image_variation.py | 2 + ...eline_versatile_diffusion_text_to_image.py | 2 + tests/pipelines/audioldm2/test_audioldm2.py | 22 +----- ...test_stable_diffusion_attend_and_excite.py | 4 +- .../test_stable_diffusion_latent_upscale.py | 4 +- tests/pipelines/test_pipelines_common.py | 34 +++++++++- 85 files changed, 370 insertions(+), 1822 deletions(-) diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py index 7af8027ed763..9b56af541d8a 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py @@ 
-19,8 +19,6 @@ from packaging import version from transformers import CLIPImageProcessor, XLMRobertaTokenizer -from diffusers.utils import is_accelerate_available, is_accelerate_version - from ...configuration_utils import FrozenDict from ...image_processor import VaeImageProcessor from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin @@ -100,6 +98,7 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL feature_extractor ([`~transformers.CLIPImageProcessor`]): A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. """ + model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] def __init__( @@ -221,34 +220,6 @@ def disable_vae_tiling(self): """ self.vae.disable_tiling() - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a - time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs. - Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the - iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - if self.safety_checker is not None: - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - - # We'll offload the last model manually. - self.final_offload_hook = hook - def _encode_prompt( self, prompt, @@ -750,9 +721,8 @@ def __call__( image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + # Offload all models + self.maybe_free_model_hooks() if not return_dict: return (image, has_nsfw_concept) diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index a7219446d273..0f01844395cf 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -21,8 +21,6 @@ from packaging import version from transformers import CLIPImageProcessor, XLMRobertaTokenizer -from diffusers.utils import is_accelerate_available, is_accelerate_version - from ...configuration_utils import FrozenDict from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin @@ -127,6 +125,7 @@ class AltDiffusionImg2ImgPipeline( feature_extractor ([`~transformers.CLIPImageProcessor`]): A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. 
""" + model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] def __init__( @@ -219,34 +218,6 @@ def __init__( self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.register_to_config(requires_safety_checker=requires_safety_checker) - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a - time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs. - Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the - iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - if self.safety_checker is not None: - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - - # We'll offload the last model manually. - self.final_offload_hook = hook - def _encode_prompt( self, prompt, @@ -773,9 +744,8 @@ def __call__( image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + # Offload all models + self.maybe_free_model_hooks() if not return_dict: return (image, has_nsfw_concept) diff --git a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py index c95e45000133..31e09b728531 100644 --- a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py +++ b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py @@ -72,6 +72,7 @@ class AudioLDMPipeline(DiffusionPipeline): vocoder ([`~transformers.SpeechT5HifiGan`]): Vocoder of class `SpeechT5HifiGan`. """ + model_cpu_offload_seq = "text_encoder->unet->vae" def __init__( self, diff --git a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py index e5e03036caec..31b9266060b0 100644 --- a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py +++ b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py @@ -947,6 +947,8 @@ def __call__( if callback is not None and i % callback_steps == 0: callback(i, t, latents) + self.maybe_free_model_hooks() + # 8. 
Post-processing if not output_type == "latent": latents = 1 / self.vae.config.scaling_factor * latents diff --git a/src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py b/src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py index 511c767aeaf4..de1b1fd93c7f 100644 --- a/src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py +++ b/src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py @@ -5,8 +5,6 @@ from ...models import UNet2DModel from ...schedulers import CMStochasticIterativeScheduler from ...utils import ( - is_accelerate_available, - is_accelerate_version, logging, replace_example_docstring, ) @@ -62,6 +60,7 @@ class ConsistencyModelPipeline(DiffusionPipeline): A scheduler to be used in combination with `unet` to denoise the encoded image latents. Currently only compatible with [`CMStochasticIterativeScheduler`]. """ + model_cpu_offload_seq = "unet" def __init__(self, unet: UNet2DModel, scheduler: CMStochasticIterativeScheduler) -> None: super().__init__() @@ -73,34 +72,6 @@ def __init__(self, unet: UNet2DModel, scheduler: CMStochasticIterativeScheduler) self.safety_checker = None - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a - time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs. - Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the - iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.unet]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - if self.safety_checker is not None: - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - - # We'll offload the last model manually. - self.final_offload_hook = hook - def prepare_latents(self, batch_size, num_channels, height, width, dtype, device, generator, latents=None): shape = (batch_size, num_channels, height, width) if isinstance(generator, list) and len(generator) != batch_size: @@ -280,9 +251,8 @@ def __call__( # 6. 
Post-process image sample image = self.postprocess_image(sample, output_type=output_type) - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + # Offload all models + self.maybe_free_model_hooks() if not return_dict: return (image,) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py index bb569249e5f5..5c4e8fb0b555 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py @@ -29,8 +29,6 @@ from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( deprecate, - is_accelerate_available, - is_accelerate_version, logging, replace_example_docstring, ) @@ -125,6 +123,7 @@ class StableDiffusionControlNetPipeline( feature_extractor ([`~transformers.CLIPImageProcessor`]): A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. """ + model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] def __init__( @@ -210,34 +209,6 @@ def disable_vae_tiling(self): """ self.vae.disable_tiling() - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - hook = None - for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - if self.safety_checker is not None: - # the safety checker can offload the vae again - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - - # control net hook has be manually offloaded as it alternates with unet - cpu_offload_with_hook(self.controlnet, device) - - # We'll offload the last model manually. 
- self.final_offload_hook = hook - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, @@ -1031,9 +1002,8 @@ def __call__( image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + # Offload all models + self.maybe_free_model_hooks() if not return_dict: return (image, has_nsfw_concept) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py index 7a173d98d279..abfa7225d15e 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py @@ -28,8 +28,6 @@ from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( deprecate, - is_accelerate_available, - is_accelerate_version, logging, replace_example_docstring, ) @@ -149,6 +147,7 @@ class StableDiffusionControlNetImg2ImgPipeline( feature_extractor ([`~transformers.CLIPImageProcessor`]): A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. """ + model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] def __init__( @@ -234,34 +233,6 @@ def disable_vae_tiling(self): """ self.vae.disable_tiling() - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - hook = None - for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - if self.safety_checker is not None: - # the safety checker can offload the vae again - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - - # control net hook has be manually offloaded as it alternates with unet - cpu_offload_with_hook(self.controlnet, device) - - # We'll offload the last model manually. 
- self.final_offload_hook = hook - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, @@ -1107,9 +1078,8 @@ def __call__( image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + # Offload all models + self.maybe_free_model_hooks() if not return_dict: return (image, has_nsfw_concept) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index c933bf9ccee5..2f046c137e3f 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -30,8 +30,6 @@ from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( deprecate, - is_accelerate_available, - is_accelerate_version, logging, replace_example_docstring, ) @@ -273,6 +271,7 @@ class StableDiffusionControlNetInpaintPipeline( feature_extractor ([`~transformers.CLIPImageProcessor`]): A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. """ + model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] def __init__( @@ -361,34 +360,6 @@ def disable_vae_tiling(self): """ self.vae.disable_tiling() - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - hook = None - for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - if self.safety_checker is not None: - # the safety checker can offload the vae again - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - - # control net hook has be manually offloaded as it alternates with unet - cpu_offload_with_hook(self.controlnet, device) - - # We'll offload the last model manually. 
- self.final_offload_hook = hook - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, @@ -1373,9 +1344,8 @@ def __call__( image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + # Offload all models + self.maybe_free_model_hooks() if not return_dict: return (image, has_nsfw_concept) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py index 9d0dd462ba7e..6595d8f4566d 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py @@ -166,6 +166,7 @@ class StableDiffusionXLControlNetInpaintPipeline(DiffusionPipeline, LoraLoaderMi A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. """ + model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae" _optional_components = ["tokenizer", "text_encoder"] def __init__( @@ -248,38 +249,6 @@ def disable_vae_tiling(self): """ self.vae.disable_tiling() - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - model_sequence = ( - [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2] - ) - model_sequence.extend([self.unet, self.vae]) - - hook = None - for cpu_offloaded_model in model_sequence: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - cpu_offload_with_hook(self.controlnet, device) - - # We'll offload the last model manually. - self.final_offload_hook = hook - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt def encode_prompt( self, diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py index 50e13b76d664..e2f463329c3d 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py @@ -145,6 +145,9 @@ class StableDiffusionXLControlNetPipeline( watermark output images. If not defined, it defaults to `True` if the package is installed; otherwise no watermarker is used. 
""" + model_cpu_offload_seq = ( + "text_encoder->text_encoder_2->unet->vae" # leave controlnet out on purpose because it iterates with unet + ) def __init__( self, @@ -221,38 +224,6 @@ def disable_vae_tiling(self): """ self.vae.disable_tiling() - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - model_sequence = ( - [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2] - ) - model_sequence.extend([self.unet, self.vae]) - - hook = None - for cpu_offloaded_model in model_sequence: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - cpu_offload_with_hook(self.controlnet, device) - - # We'll offload the last model manually. - self.final_offload_hook = hook - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt def encode_prompt( self, @@ -1170,12 +1141,10 @@ def __call__( if callback is not None and i % callback_steps == 0: callback(i, t, latents) - # If we do sequential model offloading, let's offload unet and controlnet # manually for max memory savings - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.unet.to("cpu") - self.controlnet.to("cpu") - torch.cuda.empty_cache() + if self.vae.dtype == torch.float16 and self.vae.config.force_upcast: + self.upcast_vae() + latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype) if not output_type == "latent": # make sure the VAE is in float32 mode, as it overflows in float16 @@ -1192,17 +1161,16 @@ def __call__( self.vae.to(dtype=torch.float16) else: image = latents - return StableDiffusionXLPipelineOutput(images=image) - # apply watermark if available - if self.watermark is not None: - image = self.watermark.apply_watermark(image) + if not output_type == "latent": + # apply watermark if available + if self.watermark is not None: + image = self.watermark.apply_watermark(image) - image = self.image_processor.postprocess(image, output_type=output_type) + image = self.image_processor.postprocess(image, output_type=output_type) - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + # Offload all models + self.maybe_free_model_hooks() if not return_dict: return (image,) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py index ca3bc8ca7754..8337b704450b 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +++ 
b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py @@ -36,8 +36,6 @@ from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( - is_accelerate_available, - is_accelerate_version, logging, replace_example_docstring, ) @@ -179,6 +177,7 @@ class StableDiffusionXLControlNetImg2ImgPipeline(DiffusionPipeline, TextualInver watermark output images. If not defined, it will default to True if the package is installed, otherwise no watermarker will be used. """ + model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae" _optional_components = ["tokenizer", "text_encoder"] def __init__( @@ -258,38 +257,6 @@ def disable_vae_tiling(self): """ self.vae.disable_tiling() - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - model_sequence = ( - [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2] - ) - model_sequence.extend([self.unet, self.vae]) - - hook = None - for cpu_offloaded_model in model_sequence: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - cpu_offload_with_hook(self.controlnet, device) - - # We'll offload the last model manually. - self.final_offload_hook = hook - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt def encode_prompt( self, diff --git a/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py b/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py index 77c57a1425d3..58326d5df471 100644 --- a/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +++ b/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py @@ -39,6 +39,7 @@ class DanceDiffusionPipeline(DiffusionPipeline): A scheduler to be used in combination with `unet` to denoise the encoded audio latents. Can be one of [`IPNDMScheduler`]. """ + model_cpu_offload_seq = "unet" def __init__(self, unet, scheduler): super().__init__() diff --git a/src/diffusers/pipelines/ddim/pipeline_ddim.py b/src/diffusers/pipelines/ddim/pipeline_ddim.py index dcb326ede058..527e3f04c0f4 100644 --- a/src/diffusers/pipelines/ddim/pipeline_ddim.py +++ b/src/diffusers/pipelines/ddim/pipeline_ddim.py @@ -35,6 +35,7 @@ class DDIMPipeline(DiffusionPipeline): A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of [`DDPMScheduler`], or [`DDIMScheduler`]. 
""" + model_cpu_offload_seq = "unet" def __init__(self, unet, scheduler): super().__init__() diff --git a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py index d34bea7f9cf0..a07988fca842 100644 --- a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py +++ b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py @@ -35,6 +35,7 @@ class DDPMPipeline(DiffusionPipeline): A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of [`DDPMScheduler`], or [`DDIMScheduler`]. """ + model_cpu_offload_seq = "unet" def __init__(self, unet, scheduler): super().__init__() diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py index 0f4e702268d4..a490a8904497 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py @@ -13,7 +13,6 @@ from ...utils import ( BACKENDS_MAPPING, is_accelerate_available, - is_accelerate_version, is_bs4_available, is_ftfy_available, logging, @@ -103,6 +102,7 @@ class IFPipeline(DiffusionPipeline, LoraLoaderMixin): ) # noqa _optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"] + model_cpu_offload_seq = "text_encoder->unet" def __init__( self, @@ -144,47 +144,6 @@ def __init__( ) self.register_to_config(requires_safety_checker=requires_safety_checker) - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - - if self.text_encoder is not None: - _, hook = cpu_offload_with_hook(self.text_encoder, device, prev_module_hook=hook) - - # Accelerate will move the next model to the device _before_ calling the offload hook of the - # previous model. This will cause both models to be present on the device at the same time. - # IF uses T5 for its text encoder which is really large. We can manually call the offload - # hook for the text encoder to ensure it's moved to the cpu before the unet is moved to - # the GPU. - self.text_encoder_offload_hook = hook - - _, hook = cpu_offload_with_hook(self.unet, device, prev_module_hook=hook) - - # if the safety checker isn't called, `unet_offload_hook` will have to be called to manually offload the unet - self.unet_offload_hook = hook - - if self.safety_checker is not None: - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - - # We'll offload the last model manually. - self.final_offload_hook = hook - def remove_all_hooks(self): if is_accelerate_available(): from accelerate.hooks import remove_hook_from_module @@ -806,9 +765,8 @@ def __call__( # 9. 
Run safety checker image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + # Offload all models + self.maybe_free_model_hooks() if not return_dict: return (image, nsfw_detected, watermark_detected) diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py index e14133f0e481..65a4e7d5f129 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py @@ -16,7 +16,6 @@ BACKENDS_MAPPING, PIL_INTERPOLATION, is_accelerate_available, - is_accelerate_version, is_bs4_available, is_ftfy_available, logging, @@ -127,6 +126,7 @@ class IFImg2ImgPipeline(DiffusionPipeline, LoraLoaderMixin): ) # noqa _optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"] + model_cpu_offload_seq = "text_encoder->unet" def __init__( self, @@ -168,48 +168,6 @@ def __init__( ) self.register_to_config(requires_safety_checker=requires_safety_checker) - # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_model_cpu_offload - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - - if self.text_encoder is not None: - _, hook = cpu_offload_with_hook(self.text_encoder, device, prev_module_hook=hook) - - # Accelerate will move the next model to the device _before_ calling the offload hook of the - # previous model. This will cause both models to be present on the device at the same time. - # IF uses T5 for its text encoder which is really large. We can manually call the offload - # hook for the text encoder to ensure it's moved to the cpu before the unet is moved to - # the GPU. - self.text_encoder_offload_hook = hook - - _, hook = cpu_offload_with_hook(self.unet, device, prev_module_hook=hook) - - # if the safety checker isn't called, `unet_offload_hook` will have to be called to manually offload the unet - self.unet_offload_hook = hook - - if self.safety_checker is not None: - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - - # We'll offload the last model manually. - self.final_offload_hook = hook - # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.remove_all_hooks def remove_all_hooks(self): if is_accelerate_available(): @@ -930,9 +888,8 @@ def __call__( # 9. 
Run safety checker image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + # Offload all models + self.maybe_free_model_hooks() if not return_dict: return (image, nsfw_detected, watermark_detected) diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py index 20ac5a90e2cc..5b47df81668a 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py @@ -17,7 +17,6 @@ BACKENDS_MAPPING, PIL_INTERPOLATION, is_accelerate_available, - is_accelerate_version, is_bs4_available, is_ftfy_available, logging, @@ -131,6 +130,7 @@ class IFImg2ImgSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin): ) # noqa _optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor"] + model_cpu_offload_seq = "text_encoder->unet" def __init__( self, @@ -179,48 +179,6 @@ def __init__( ) self.register_to_config(requires_safety_checker=requires_safety_checker) - # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_model_cpu_offload - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - - if self.text_encoder is not None: - _, hook = cpu_offload_with_hook(self.text_encoder, device, prev_module_hook=hook) - - # Accelerate will move the next model to the device _before_ calling the offload hook of the - # previous model. This will cause both models to be present on the device at the same time. - # IF uses T5 for its text encoder which is really large. We can manually call the offload - # hook for the text encoder to ensure it's moved to the cpu before the unet is moved to - # the GPU. - self.text_encoder_offload_hook = hook - - _, hook = cpu_offload_with_hook(self.unet, device, prev_module_hook=hook) - - # if the safety checker isn't called, `unet_offload_hook` will have to be called to manually offload the unet - self.unet_offload_hook = hook - - if self.safety_checker is not None: - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - - # We'll offload the last model manually. - self.final_offload_hook = hook - # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.remove_all_hooks def remove_all_hooks(self): if is_accelerate_available(): @@ -1048,9 +1006,8 @@ def __call__( # 11. 
Run safety checker image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + # Offload all models + self.maybe_free_model_hooks() if not return_dict: return (image, nsfw_detected, watermark_detected) diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py index d54c9aedc6a5..466d386a959c 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py @@ -16,7 +16,6 @@ BACKENDS_MAPPING, PIL_INTERPOLATION, is_accelerate_available, - is_accelerate_version, is_bs4_available, is_ftfy_available, logging, @@ -130,6 +129,7 @@ class IFInpaintingPipeline(DiffusionPipeline, LoraLoaderMixin): ) # noqa _optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"] + model_cpu_offload_seq = "text_encoder->unet" def __init__( self, @@ -171,48 +171,6 @@ def __init__( ) self.register_to_config(requires_safety_checker=requires_safety_checker) - # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_model_cpu_offload - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - - if self.text_encoder is not None: - _, hook = cpu_offload_with_hook(self.text_encoder, device, prev_module_hook=hook) - - # Accelerate will move the next model to the device _before_ calling the offload hook of the - # previous model. This will cause both models to be present on the device at the same time. - # IF uses T5 for its text encoder which is really large. We can manually call the offload - # hook for the text encoder to ensure it's moved to the cpu before the unet is moved to - # the GPU. - self.text_encoder_offload_hook = hook - - _, hook = cpu_offload_with_hook(self.unet, device, prev_module_hook=hook) - - # if the safety checker isn't called, `unet_offload_hook` will have to be called to manually offload the unet - self.unet_offload_hook = hook - - if self.safety_checker is not None: - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - - # We'll offload the last model manually. - self.final_offload_hook = hook - # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.remove_all_hooks def remove_all_hooks(self): if is_accelerate_available(): @@ -1049,9 +1007,8 @@ def __call__( # 9. 
Run safety checker image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + # Offload all models + self.maybe_free_model_hooks() if not return_dict: return (image, nsfw_detected, watermark_detected) diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py index 1217d2d8398f..c36b138222b9 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py @@ -17,7 +17,6 @@ BACKENDS_MAPPING, PIL_INTERPOLATION, is_accelerate_available, - is_accelerate_version, is_bs4_available, is_ftfy_available, logging, @@ -132,6 +131,7 @@ class IFInpaintingSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin): r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}" ) # noqa + model_cpu_offload_seq = "text_encoder->unet" _optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"] def __init__( @@ -181,48 +181,6 @@ def __init__( ) self.register_to_config(requires_safety_checker=requires_safety_checker) - # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_model_cpu_offload - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - - if self.text_encoder is not None: - _, hook = cpu_offload_with_hook(self.text_encoder, device, prev_module_hook=hook) - - # Accelerate will move the next model to the device _before_ calling the offload hook of the - # previous model. This will cause both models to be present on the device at the same time. - # IF uses T5 for its text encoder which is really large. We can manually call the offload - # hook for the text encoder to ensure it's moved to the cpu before the unet is moved to - # the GPU. - self.text_encoder_offload_hook = hook - - _, hook = cpu_offload_with_hook(self.unet, device, prev_module_hook=hook) - - # if the safety checker isn't called, `unet_offload_hook` will have to be called to manually offload the unet - self.unet_offload_hook = hook - - if self.safety_checker is not None: - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - - # We'll offload the last model manually. 
- self.final_offload_hook = hook - # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.remove_all_hooks def remove_all_hooks(self): if is_accelerate_available(): diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py index 8e1a6338eaed..13b65cb30ea8 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py @@ -16,7 +16,6 @@ from ...utils import ( BACKENDS_MAPPING, is_accelerate_available, - is_accelerate_version, is_bs4_available, is_ftfy_available, logging, @@ -89,6 +88,7 @@ class IFSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin): ) # noqa _optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"] + model_cpu_offload_seq = "text_encoder->unet" def __init__( self, @@ -137,48 +137,6 @@ def __init__( ) self.register_to_config(requires_safety_checker=requires_safety_checker) - # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_model_cpu_offload - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - - if self.text_encoder is not None: - _, hook = cpu_offload_with_hook(self.text_encoder, device, prev_module_hook=hook) - - # Accelerate will move the next model to the device _before_ calling the offload hook of the - # previous model. This will cause both models to be present on the device at the same time. - # IF uses T5 for its text encoder which is really large. We can manually call the offload - # hook for the text encoder to ensure it's moved to the cpu before the unet is moved to - # the GPU. - self.text_encoder_offload_hook = hook - - _, hook = cpu_offload_with_hook(self.unet, device, prev_module_hook=hook) - - # if the safety checker isn't called, `unet_offload_hook` will have to be called to manually offload the unet - self.unet_offload_hook = hook - - if self.safety_checker is not None: - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - - # We'll offload the last model manually. - self.final_offload_hook = hook - # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.remove_all_hooks def remove_all_hooks(self): if is_accelerate_available(): @@ -904,9 +862,8 @@ def __call__( # 10. 
Run safety checker image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + # Offload all models + self.maybe_free_model_hooks() if not return_dict: return (image, nsfw_detected, watermark_detected) diff --git a/src/diffusers/pipelines/dit/pipeline_dit.py b/src/diffusers/pipelines/dit/pipeline_dit.py index 5f5b0b199168..022aa1202603 100644 --- a/src/diffusers/pipelines/dit/pipeline_dit.py +++ b/src/diffusers/pipelines/dit/pipeline_dit.py @@ -43,6 +43,7 @@ class DiTPipeline(DiffusionPipeline): scheduler ([`DDIMScheduler`]): A scheduler to be used in combination with `transformer` to denoise the encoded image latents. """ + model_cpu_offload_seq = "transformer->vae" def __init__( self, diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index 8545b8b42ff0..a715eb784617 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -22,8 +22,6 @@ from ...models import UNet2DConditionModel, VQModel from ...schedulers import DDIMScheduler, DDPMScheduler from ...utils import ( - is_accelerate_available, - is_accelerate_version, logging, replace_example_docstring, ) @@ -95,6 +93,8 @@ class KandinskyPipeline(DiffusionPipeline): MoVQ Decoder to generate the image from the latents. """ + model_cpu_offload_seq = "text_encoder->unet->movq" + def __init__( self, text_encoder: MultilingualCLIP, @@ -228,31 +228,6 @@ def _encode_prompt( return prompt_embeds, text_encoder_hidden_states, text_mask - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.text_encoder, self.unet, self.movq]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - # We'll offload the last model manually. 
- self.final_offload_hook = hook - @torch.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py index 7676809b4bdb..1c5a65722f35 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py @@ -142,6 +142,7 @@ class KandinskyCombinedPipeline(DiffusionPipeline): """ _load_connected_pipes = True + model_cpu_offload_seq = "text_encoder->unet->movq->prior_prior->prior_image_encoder->prior_text_encoder" def __init__( self, @@ -191,16 +192,6 @@ def __init__( def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None): self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op) - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - self.prior_pipe.enable_model_cpu_offload() - self.decoder_pipe.enable_model_cpu_offload() - def enable_sequential_cpu_offload(self, gpu_id=0): r""" Offloads all models (`unet`, `text_encoder`, `vae`, and `safety checker` state dicts) to CPU using 🤗 @@ -365,6 +356,7 @@ class KandinskyImg2ImgCombinedPipeline(DiffusionPipeline): """ _load_connected_pipes = True + model_cpu_offload_seq = "prior_text_encoder->prior_image_encoder->prior_prior->" "text_encoder->unet->movq" def __init__( self, @@ -414,16 +406,6 @@ def __init__( def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None): self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op) - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - self.prior_pipe.enable_model_cpu_offload() - self.decoder_pipe.enable_model_cpu_offload() - def enable_sequential_cpu_offload(self, gpu_id=0): r""" Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, @@ -611,6 +593,7 @@ class KandinskyInpaintCombinedPipeline(DiffusionPipeline): """ _load_connected_pipes = True + model_cpu_offload_seq = "prior_text_encoder->prior_image_encoder->prior_prior->" "text_encoder->unet->movq" def __init__( self, @@ -660,16 +643,6 @@ def __init__( def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None): self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op) - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. 
Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - self.prior_pipe.enable_model_cpu_offload() - self.decoder_pipe.enable_model_cpu_offload() - def enable_sequential_cpu_offload(self, gpu_id=0): r""" Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py index 5013203049a1..7247adcf33f8 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py @@ -25,8 +25,6 @@ from ...models import UNet2DConditionModel, VQModel from ...schedulers import DDIMScheduler from ...utils import ( - is_accelerate_available, - is_accelerate_version, logging, replace_example_docstring, ) @@ -117,6 +115,8 @@ class KandinskyImg2ImgPipeline(DiffusionPipeline): MoVQ image encoder and decoder """ + model_cpu_offload_seq = "text_encoder->unet->movq" + def __init__( self, text_encoder: MultilingualCLIP, @@ -263,31 +263,6 @@ def _encode_prompt( return prompt_embeds, text_encoder_hidden_states, text_mask - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.text_encoder, self.unet, self.movq]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - # We'll offload the last model manually. 
- self.final_offload_hook = hook - # add_noise method to overwrite the one in schedule because it use a different beta schedule for adding noise vs sampling def add_noise( self, diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py index 4a920b5c3262..86618eb4e044 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -29,8 +29,6 @@ from ...models import UNet2DConditionModel, VQModel from ...schedulers import DDIMScheduler from ...utils import ( - is_accelerate_available, - is_accelerate_version, logging, replace_example_docstring, ) @@ -259,6 +257,8 @@ class KandinskyInpaintPipeline(DiffusionPipeline): MoVQ image encoder and decoder """ + model_cpu_offload_seq = "text_encoder->unet->movq" + def __init__( self, text_encoder: MultilingualCLIP, @@ -393,31 +393,6 @@ def _encode_prompt( return prompt_embeds, text_encoder_hidden_states, text_mask - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.text_encoder, self.unet, self.movq]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - # We'll offload the last model manually. - self.final_offload_hook = hook - @torch.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index b6c031feac29..0e9eb9806dc1 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -24,8 +24,6 @@ from ...schedulers import UnCLIPScheduler from ...utils import ( BaseOutput, - is_accelerate_available, - is_accelerate_version, logging, replace_example_docstring, ) @@ -149,6 +147,7 @@ class KandinskyPriorPipeline(DiffusionPipeline): """ _exclude_from_cpu_offload = ["prior"] + model_cpu_offload_seq = "text_encoder->prior" def __init__( self, @@ -395,35 +394,6 @@ def _encode_prompt( return prompt_embeds, text_encoder_hidden_states, text_mask - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. 
Memory savings are lower than with
-        `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
-        """
-        if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
-            from accelerate import cpu_offload_with_hook
-        else:
-            raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
-
-        device = torch.device(f"cuda:{gpu_id}")
-
-        if self.device.type != "cpu":
-            self.to("cpu", silence_dtype_warnings=True)
-            torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
-
-        hook = None
-        for cpu_offloaded_model in [self.text_encoder, self.prior]:
-            _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
-
-        # We'll offload the last model manually.
-        self.prior_hook = hook
-
-        _, hook = cpu_offload_with_hook(self.image_encoder, device, prev_module_hook=self.prior_hook)
-
-        self.final_offload_hook = hook
-
     @torch.no_grad()
     @replace_example_docstring(EXAMPLE_DOC_STRING)
     def __call__(
@@ -557,8 +527,7 @@ def __call__(
         if negative_prompt is None:
             zero_embeds = self.get_zero_embed(latents.shape[0], device=latents.device)
-            if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
-                self.final_offload_hook.offload()
+            self.maybe_free_model_hooks()
         else:
             image_embeddings, zero_embeds = image_embeddings.chunk(2)
diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py
index 2ff2d8b004ab..5d1cbb1af291 100644
--- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py
+++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py
@@ -19,8 +19,6 @@
 from ...models import UNet2DConditionModel, VQModel
 from ...schedulers import DDPMScheduler
 from ...utils import (
-    is_accelerate_available,
-    is_accelerate_version,
     logging,
     replace_example_docstring,
 )
@@ -82,6 +80,8 @@ class KandinskyV22Pipeline(DiffusionPipeline):
             MoVQ Decoder to generate the image from the latents.
     """

+    model_cpu_offload_seq = "unet->movq"
+
     def __init__(
         self,
         unet: UNet2DConditionModel,
@@ -109,31 +109,6 @@ def prepare_latents(self, shape, dtype, device, generator, latents, scheduler):
             latents = latents * scheduler.init_noise_sigma
         return latents

-    def enable_model_cpu_offload(self, gpu_id=0):
-        r"""
-        Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
-        to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
-        method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
-        `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
-        """
-        if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
-            from accelerate import cpu_offload_with_hook
-        else:
-            raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
-
-        device = torch.device(f"cuda:{gpu_id}")
-
-        if self.device.type != "cpu":
-            self.to("cpu", silence_dtype_warnings=True)
-            torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
-
-        hook = None
-        for cpu_offloaded_model in [self.unet, self.movq]:
-            _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
-
-        # We'll offload the last model manually.
- self.final_offload_hook = hook - @torch.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( @@ -273,9 +248,7 @@ def __call__( # post-processing image = self.movq.decode(latents, force_not_quantize=True)["sample"] - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + self.maybe_free_model_hooks() if output_type not in ["pt", "np", "pil"]: raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}") diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py index 4f8626a9bdba..4373f700d0b9 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py @@ -136,6 +136,7 @@ class KandinskyV22CombinedPipeline(DiffusionPipeline): A image_processor to be used to preprocess image from clip. """ + model_cpu_offload_seq = "prior_text_encoder->prior_image_encoder->unet->movq" _load_connected_pipes = True def __init__( @@ -180,16 +181,6 @@ def __init__( def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None): self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op) - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - self.prior_pipe.enable_model_cpu_offload() - self.decoder_pipe.enable_model_cpu_offload() - def enable_sequential_cpu_offload(self, gpu_id=0): r""" Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, @@ -351,6 +342,7 @@ class KandinskyV22Img2ImgCombinedPipeline(DiffusionPipeline): A image_processor to be used to preprocess image from clip. """ + model_cpu_offload_seq = "prior_text_encoder->prior_image_encoder->unet->movq" _load_connected_pipes = True def __init__( @@ -588,6 +580,7 @@ class KandinskyV22InpaintCombinedPipeline(DiffusionPipeline): A image_processor to be used to preprocess image from clip. """ + model_cpu_offload_seq = "prior_text_encoder->prior_image_encoder->unet->movq" _load_connected_pipes = True def __init__( @@ -632,16 +625,6 @@ def __init__( def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None): self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op) - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. 
- """ - self.prior_pipe.enable_model_cpu_offload() - self.decoder_pipe.enable_model_cpu_offload() - def enable_sequential_cpu_offload(self, gpu_id=0): r""" Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py index ec82f4516042..cb0465c11ef9 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py @@ -19,10 +19,7 @@ from ...models import UNet2DConditionModel, VQModel from ...schedulers import DDPMScheduler from ...utils import ( - is_accelerate_available, - is_accelerate_version, logging, - replace_example_docstring, ) from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput @@ -122,6 +119,8 @@ class KandinskyV22ControlnetPipeline(DiffusionPipeline): MoVQ Decoder to generate the image from the latents. """ + model_cpu_offload_seq = "unet->movq" + def __init__( self, unet: UNet2DConditionModel, @@ -149,34 +148,7 @@ def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): latents = latents * scheduler.init_noise_sigma return latents - # Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.KandinskyV22Pipeline.enable_model_cpu_offload - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.unet, self.movq]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - # We'll offload the last model manually. 
- self.final_offload_hook = hook - @torch.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], @@ -327,9 +299,8 @@ def __call__( # post-processing image = self.movq.decode(latents, force_not_quantize=True)["sample"] - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + # Offload all models + self.maybe_free_model_hooks() if output_type not in ["pt", "np", "pil"]: raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}") diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py index 8a2deb52fbce..1f3edf4b5b49 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py @@ -22,10 +22,7 @@ from ...models import UNet2DConditionModel, VQModel from ...schedulers import DDPMScheduler from ...utils import ( - is_accelerate_available, - is_accelerate_version, logging, - replace_example_docstring, ) from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput @@ -136,6 +133,8 @@ class KandinskyV22ControlnetImg2ImgPipeline(DiffusionPipeline): MoVQ Decoder to generate the image from the latents. """ + model_cpu_offload_seq = "unet->movq" + def __init__( self, unet: UNet2DConditionModel, @@ -204,34 +203,7 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt return latents - # Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.KandinskyV22Pipeline.enable_model_cpu_offload - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.unet, self.movq]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - # We'll offload the last model manually. 
- self.final_offload_hook = hook - @torch.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], @@ -388,9 +360,8 @@ def __call__( # post-processing image = self.movq.decode(latents, force_not_quantize=True)["sample"] - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + # Offload all models + self.maybe_free_model_hooks() if output_type not in ["pt", "np", "pil"]: raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}") diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py index 9b0f576fa7d0..627857592abe 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py @@ -22,10 +22,7 @@ from ...models import UNet2DConditionModel, VQModel from ...schedulers import DDPMScheduler from ...utils import ( - is_accelerate_available, - is_accelerate_version, logging, - replace_example_docstring, ) from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput @@ -110,6 +107,8 @@ class KandinskyV22Img2ImgPipeline(DiffusionPipeline): MoVQ Decoder to generate the image from the latents. """ + model_cpu_offload_seq = "unet->movq" + def __init__( self, unet: UNet2DConditionModel, @@ -177,34 +176,7 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt return latents - # Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.KandinskyV22Pipeline.enable_model_cpu_offload - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.unet, self.movq]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - # We'll offload the last model manually. 
- self.final_offload_hook = hook - @torch.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], @@ -352,9 +324,8 @@ def __call__( # post-processing image = self.movq.decode(latents, force_not_quantize=True)["sample"] - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + # Offload all models + self.maybe_free_model_hooks() if output_type not in ["pt", "np", "pil"]: raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}") diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py index 7320a62ef6e0..0106b96e22a5 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py @@ -26,10 +26,7 @@ from ...models import UNet2DConditionModel, VQModel from ...schedulers import DDPMScheduler from ...utils import ( - is_accelerate_available, - is_accelerate_version, logging, - replace_example_docstring, ) from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput @@ -253,6 +250,8 @@ class KandinskyV22InpaintPipeline(DiffusionPipeline): MoVQ Decoder to generate the image from the latents. """ + model_cpu_offload_seq = "unet->movq" + def __init__( self, unet: UNet2DConditionModel, @@ -281,34 +280,7 @@ def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): latents = latents * scheduler.init_noise_sigma return latents - # Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.KandinskyV22Pipeline.enable_model_cpu_offload - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.unet, self.movq]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - # We'll offload the last model manually. 
- self.final_offload_hook = hook - @torch.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], @@ -505,9 +477,8 @@ def __call__( latents = mask_image[:1] * image[:1] + (1 - mask_image[:1]) * latents image = self.movq.decode(latents, force_not_quantize=True)["sample"] - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + # Offload all models + self.maybe_free_model_hooks() if output_type not in ["pt", "np", "pil"]: raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}") diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py index 943363dc7795..fa2935465fb5 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py @@ -7,8 +7,6 @@ from ...models import PriorTransformer from ...schedulers import UnCLIPScheduler from ...utils import ( - is_accelerate_available, - is_accelerate_version, logging, replace_example_docstring, ) @@ -106,6 +104,7 @@ class KandinskyV22PriorPipeline(DiffusionPipeline): A image_processor to be used to preprocess image from clip. """ + model_cpu_offload_seq = "text_encoder->image_encoder->prior" _exclude_from_cpu_offload = ["prior"] def __init__( @@ -355,35 +354,6 @@ def _encode_prompt( return prompt_embeds, text_encoder_hidden_states, text_mask - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.text_encoder, self.prior]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - # We'll offload the last model manually. 
- self.prior_hook = hook - - _, hook = cpu_offload_with_hook(self.image_encoder, device, prev_module_hook=self.prior_hook) - - self.final_offload_hook = hook - @torch.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py index f17f463b9bfe..5be00b04d6c2 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py @@ -7,8 +7,6 @@ from ...models import PriorTransformer from ...schedulers import UnCLIPScheduler from ...utils import ( - is_accelerate_available, - is_accelerate_version, logging, replace_example_docstring, ) @@ -122,6 +120,7 @@ class KandinskyV22PriorEmb2EmbPipeline(DiffusionPipeline): A scheduler to be used in combination with `prior` to generate image embedding. """ + model_cpu_offload_seq = "text_encoder->image_encoder->prior" _exclude_from_cpu_offload = ["prior"] def __init__( @@ -394,35 +393,6 @@ def _encode_prompt( return prompt_embeds, text_encoder_hidden_states, text_mask - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.text_encoder, self.prior]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - # We'll offload the last model manually. - self.prior_hook = hook - - _, hook = cpu_offload_with_hook(self.image_encoder, device, prev_module_hook=self.prior_hook) - - self.final_offload_hook = hook - @torch.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py index 4b4315a421e8..cedf9de01475 100644 --- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py @@ -49,6 +49,7 @@ class LDMTextToImagePipeline(DiffusionPipeline): A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. 
""" + model_cpu_offload_seq = "bert->unet->vqvae" def __init__( self, diff --git a/src/diffusers/pipelines/musicldm/pipeline_musicldm.py b/src/diffusers/pipelines/musicldm/pipeline_musicldm.py index a891099f1aac..4ee07f4e056a 100644 --- a/src/diffusers/pipelines/musicldm/pipeline_musicldm.py +++ b/src/diffusers/pipelines/musicldm/pipeline_musicldm.py @@ -28,7 +28,13 @@ from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers -from ...utils import is_librosa_available, logging, replace_example_docstring +from ...utils import ( + is_accelerate_available, + is_accelerate_version, + is_librosa_available, + logging, + replace_example_docstring, +) from ...utils.torch_utils import randn_tensor from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline @@ -391,6 +397,40 @@ def prepare_latents(self, batch_size, num_channels_latents, height, dtype, devic latents = latents * self.scheduler.init_noise_sigma return latents + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + model_sequence = [ + self.text_encoder.text_model, + self.text_encoder.text_projection, + self.unet, + self.vae, + self.vocoder, + self.text_encoder, + ] + + hook = None + for cpu_offloaded_model in model_sequence: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.final_offload_hook = hook + @torch.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( @@ -578,6 +618,8 @@ def __call__( if callback is not None and i % callback_steps == 0: callback(i, t, latents) + self.maybe_free_model_hooks() + # 8. Post-processing if not output_type == "latent": latents = 1 / self.vae.config.scaling_factor * latents diff --git a/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py b/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py index 383edae08e8f..fd589740f907 100644 --- a/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +++ b/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py @@ -169,6 +169,9 @@ class PaintByExamplePipeline(DiffusionPipeline): """ # TODO: feature_extractor is required to encode initial images (if they are in PIL format), # we should give a descriptive message if the pipeline doesn't have one. 
+
+    model_cpu_offload_seq = "unet->vae"
+    _exclude_from_cpu_offload = ["image_encoder"]
     _optional_components = ["safety_checker"]

     def __init__(
@@ -580,6 +583,8 @@ def __call__(
             if callback is not None and i % callback_steps == 0:
                 callback(i, t, latents)

+        self.maybe_free_model_hooks()
+
         if not output_type == "latent":
             image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
             image, has_nsfw_concept = self.run_safety_checker(image, device, image_embeddings.dtype)
diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py
index b357fe6a8b29..71d32085afa4 100644
--- a/src/diffusers/pipelines/pipeline_utils.py
+++ b/src/diffusers/pipelines/pipeline_utils.py
@@ -495,6 +495,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
         pipeline to function (should be overridden by subclasses).
     """
     config_name = "model_index.json"
+    model_cpu_offload_seq = None
     _optional_components = []
     _exclude_from_cpu_offload = []
     _load_connected_pipes = False
@@ -1224,6 +1225,72 @@ def _execution_device(self):
                 return torch.device(module._hf_hook.execution_device)
         return self.device

+    def enable_model_cpu_offload(self, gpu_id: int = 0, device: Union[torch.device, str] = "cuda"):
+        r"""
+        Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
+        to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
+        method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
+        `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
+        """
+        if self.model_cpu_offload_seq is None:
+            raise ValueError(
+                "Model CPU offload cannot be enabled because no `model_cpu_offload_seq` class attribute is set."
+            )
+
+        if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
+            from accelerate import cpu_offload_with_hook
+        else:
+            raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
+
+        device = torch.device(f"cuda:{gpu_id}")
+
+        if self.device.type != "cpu":
+            self.to("cpu", silence_dtype_warnings=True)
+            device_mod = getattr(torch, self.device.type, None)
+            if hasattr(device_mod, "empty_cache") and device_mod.is_available():
+                device_mod.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
+
+        all_model_components = {k: v for k, v in self.components.items() if isinstance(v, torch.nn.Module)}
+
+        self._all_hooks = []
+        hook = None
+        for model_str in self.model_cpu_offload_seq.split("->"):
+            model = all_model_components.pop(model_str)
+            if not isinstance(model, torch.nn.Module):
+                continue
+
+            _, hook = cpu_offload_with_hook(model, device, prev_module_hook=hook)
+            self._all_hooks.append(hook)
+
+        # CPU offload models that are not in the seq chain unless they are explicitly excluded
+        # these models will stay on CPU until maybe_free_model_hooks is called
+        # some models cannot be in the seq chain because they are iteratively called, such as controlnet
+        for name, model in all_model_components.items():
+            if not isinstance(model, torch.nn.Module):
+                continue
+
+            if name in self._exclude_from_cpu_offload:
+                model.to(device)
+            else:
+                _, hook = cpu_offload_with_hook(model, device)
+                self._all_hooks.append(hook)
+
+    def maybe_free_model_hooks(self):
+        r"""
+        Offloads the models added by `enable_model_cpu_offload` back to the CPU, removes their hooks, and re-enables
+        model CPU offload so the pipeline is ready for the next call. Does nothing if offloading was never enabled.
+        """
+        if not hasattr(self, "_all_hooks") or len(self._all_hooks) == 0:
+            # `enable_model_cpu_offload` has not been called, so silently do nothing
+            return
+
+        for hook in self._all_hooks:
+            # offload model and remove hook from model
+            hook.offload()
+            hook.remove()
+
+        # make sure the model is in the same state as before calling it
+        self.enable_model_cpu_offload()
+
     def enable_sequential_cpu_offload(self, gpu_id: int = 0, device: Union[torch.device, str] = "cuda"):
         r"""
         Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
diff --git a/src/diffusers/pipelines/repaint/pipeline_repaint.py b/src/diffusers/pipelines/repaint/pipeline_repaint.py
index 5372c2431d52..bb4b3b4fdee9 100644
--- a/src/diffusers/pipelines/repaint/pipeline_repaint.py
+++ b/src/diffusers/pipelines/repaint/pipeline_repaint.py
@@ -89,6 +89,7 @@ class RePaintPipeline(DiffusionPipeline):
     unet: UNet2DModel
     scheduler: RePaintScheduler
+    model_cpu_offload_seq = "unet"

     def __init__(self, unet, scheduler):
         super().__init__()
diff --git a/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py b/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py
index c27b03968ec1..a12f983ca87d 100644
--- a/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py
+++ b/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py
@@ -46,6 +46,7 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline):
         A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
""" + model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] def __init__( diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index 7a6cd4589a0a..5a68f23b8c32 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -25,8 +25,6 @@ from ...schedulers import HeunDiscreteScheduler from ...utils import ( BaseOutput, - is_accelerate_available, - is_accelerate_version, logging, replace_example_docstring, ) @@ -99,6 +97,9 @@ class ShapEPipeline(DiffusionPipeline): rendering method. """ + model_cpu_offload_seq = "text_encoder->prior" + _exclude_from_cpu_offload = ["shap_e_renderer"] + def __init__( self, prior: PriorTransformer, @@ -129,34 +130,6 @@ def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): latents = latents * scheduler.init_noise_sigma return latents - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a - time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs. - Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the - iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.text_encoder, self.prior, self.shap_e_renderer]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - if self.safety_checker is not None: - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - - # We'll offload the last model manually. - self.final_offload_hook = hook - def _encode_prompt( self, prompt, @@ -318,6 +291,9 @@ def __call__( sample=latents, ).prev_sample + # Offload all models + self.maybe_free_model_hooks() + if output_type not in ["np", "pil", "latent", "mesh"]: raise ValueError( f"Only the output types `pil`, `np`, `latent` and `mesh` are supported not output_type={output_type}" @@ -352,10 +328,6 @@ def __call__( if output_type == "pil": images = [self.numpy_to_pil(image) for image in images] - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() - if not return_dict: return (images,) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py index a8ef7aa09027..5b24d430d015 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py @@ -98,6 +98,9 @@ class ShapEImg2ImgPipeline(DiffusionPipeline): rendering method. 
""" + model_cpu_offload_seq = "image_encoder->prior" + _exclude_from_cpu_offload = ["shap_e_renderer"] + def __init__( self, prior: PriorTransformer, @@ -309,9 +312,8 @@ def __call__( if output_type == "pil": images = [self.numpy_to_pil(image) for image in images] - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + # Offload all models + self.maybe_free_model_hooks() if not return_dict: return (images,) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py index 6896ef94a3cf..7ed335ea8f7d 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py @@ -21,8 +21,6 @@ from packaging import version from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer -from diffusers.utils import is_accelerate_available, is_accelerate_version - from ...configuration_utils import FrozenDict from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin @@ -150,6 +148,7 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor feature_extractor ([`~transformers.CLIPImageProcessor`]): A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. """ + model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] def __init__( @@ -228,35 +227,6 @@ def __init__( self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.register_to_config(requires_safety_checker=requires_safety_checker) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a - time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs. - Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the - iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - if self.safety_checker is not None: - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - - # We'll offload the last model manually. 
- self.final_offload_hook = hook - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index a84b316bbf62..2369a02a10f1 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -27,8 +27,6 @@ from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( deprecate, - is_accelerate_available, - is_accelerate_version, logging, replace_example_docstring, ) @@ -101,6 +99,7 @@ class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo feature_extractor ([`~transformers.CLIPImageProcessor`]): A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. """ + model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] def __init__( @@ -222,34 +221,6 @@ def disable_vae_tiling(self): """ self.vae.disable_tiling() - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a - time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs. - Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the - iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - if self.safety_checker is not None: - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - - # We'll offload the last model manually. 
- self.final_offload_hook = hook - def _encode_prompt( self, prompt, @@ -745,9 +716,8 @@ def __call__( image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + # Offload all models + self.maybe_free_model_hooks() if not return_dict: return (image, has_nsfw_concept) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py index d64e02e8ecd0..e62d80a25c53 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py @@ -189,6 +189,7 @@ class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, TextualInversion feature_extractor ([`~transformers.CLIPImageProcessor`]): A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. """ + model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] def __init__( diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py index 3be87fe641f6..e8fcd39202dc 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py @@ -85,6 +85,7 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoader A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. """ + model_cpu_offload_seq = "text_encoder->unet->vae" def __init__( self, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py index 13522fa780ca..f641ea54e2d6 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py @@ -32,8 +32,6 @@ PIL_INTERPOLATION, BaseOutput, deprecate, - is_accelerate_available, - is_accelerate_version, logging, replace_example_docstring, ) @@ -272,6 +270,7 @@ class StableDiffusionDiffEditPipeline(DiffusionPipeline, TextualInversionLoaderM feature_extractor ([`~transformers.CLIPImageProcessor`]): A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. """ + model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor", "inverse_scheduler"] def __init__( @@ -400,35 +399,6 @@ def disable_vae_tiling(self): """ self.vae.disable_tiling() - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a - time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs. 
- Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the - iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - if self.safety_checker is not None: - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - - # We'll offload the last model manually. - self.final_offload_hook = hook - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, @@ -1070,9 +1040,8 @@ def generate_mask( if output_type == "pil": mask_image = self.image_processor.numpy_to_pil(mask_image) - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + # Offload all models + self.maybe_free_model_hooks() return mask_image @@ -1305,9 +1274,8 @@ def invert( if decode_latents and output_type == "pil": image = self.image_processor.numpy_to_pil(image) - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + # Offload all models + self.maybe_free_model_hooks() if not return_dict: return (latents, image) @@ -1548,9 +1516,8 @@ def __call__( image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + # Offload all models + self.maybe_free_model_hooks() if not return_dict: return (image, has_nsfw_concept) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_gligen.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_gligen.py index 7748896524c0..57ca7ccbcc18 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_gligen.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_gligen.py @@ -28,8 +28,6 @@ from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( deprecate, - is_accelerate_available, - is_accelerate_version, logging, replace_example_docstring, ) @@ -125,6 +123,7 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline): A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. """ _optional_components = ["safety_checker", "feature_extractor"] + model_cpu_offload_seq = "text_encoder->unet->vae" def __init__( self, @@ -197,34 +196,6 @@ def disable_vae_tiling(self): """ self.vae.disable_tiling() - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a - time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs. 
- Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the - iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - if self.safety_checker is not None: - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - - # We'll offload the last model manually. - self.final_offload_hook = hook - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_gligen_text_image.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_gligen_text_image.py index 01cef5438a1e..e75c88f7e18f 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_gligen_text_image.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_gligen_text_image.py @@ -33,8 +33,6 @@ from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( - is_accelerate_available, - is_accelerate_version, logging, replace_example_docstring, ) @@ -182,6 +180,7 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline): feature_extractor ([`~transformers.CLIPImageProcessor`]): A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. """ + model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] def __init__( @@ -261,34 +260,6 @@ def disable_vae_tiling(self): """ self.vae.disable_tiling() - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a - time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs. - Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the - iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - if self.safety_checker is not None: - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - - # We'll offload the last model manually. 
- self.final_offload_hook = hook - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt def encode_prompt( self, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py index 328e7165a188..6caa87e369d6 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py @@ -65,6 +65,7 @@ class StableDiffusionImageVariationPipeline(DiffusionPipeline): # TODO: feature_extractor is required to encode images (if they are in PIL format), # we should give a descriptive message if the pipeline doesn't have one. _optional_components = ["safety_checker"] + model_cpu_offload_seq = "image_encoder->unet->vae" def __init__( self, @@ -392,6 +393,8 @@ def __call__( if callback is not None and i % callback_steps == 0: callback(i, t, latents) + self.maybe_free_model_hooks() + if not output_type == "latent": image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] image, has_nsfw_concept = self.run_safety_checker(image, device, image_embeddings.dtype) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 13d971de2844..3f447ccad95d 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -30,8 +30,6 @@ from ...utils import ( PIL_INTERPOLATION, deprecate, - is_accelerate_available, - is_accelerate_version, logging, replace_example_docstring, ) @@ -129,6 +127,7 @@ class StableDiffusionImg2ImgPipeline( feature_extractor ([`~transformers.CLIPImageProcessor`]): A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. """ + model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] def __init__( @@ -221,35 +220,6 @@ def __init__( self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.register_to_config(requires_safety_checker=requires_safety_checker) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a - time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs. - Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the - iterative execution of the `unet`. 
- """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - if self.safety_checker is not None: - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - - # We'll offload the last model manually. - self.final_offload_hook = hook - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, @@ -775,9 +745,8 @@ def __call__( image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + # Offload all models + self.maybe_free_model_hooks() if not return_dict: return (image, has_nsfw_concept) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index a01442df5ce8..be52e27a0ec0 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -27,7 +27,7 @@ from ...models import AsymmetricAutoencoderKL, AutoencoderKL, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers -from ...utils import deprecate, is_accelerate_available, is_accelerate_version, logging +from ...utils import deprecate, logging from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from . import StableDiffusionPipelineOutput @@ -192,6 +192,7 @@ class StableDiffusionInpaintPipeline( feature_extractor ([`~transformers.CLIPImageProcessor`]): A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. """ + model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] def __init__( @@ -292,35 +293,6 @@ def __init__( ) self.register_to_config(requires_safety_checker=requires_safety_checker) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a - time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs. - Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the - iterative execution of the `unet`. 
- """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - if self.safety_checker is not None: - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - - # We'll offload the last model manually. - self.final_offload_hook = hook - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, @@ -1064,9 +1036,8 @@ def __call__( image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + # Offload all models + self.maybe_free_model_hooks() if not return_dict: return (image, has_nsfw_concept) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py index 3be6fc93e970..0fe05966f260 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py @@ -27,7 +27,7 @@ from ...models import AutoencoderKL, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers -from ...utils import PIL_INTERPOLATION, deprecate, is_accelerate_available, is_accelerate_version, logging +from ...utils import PIL_INTERPOLATION, deprecate, logging from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from . import StableDiffusionPipelineOutput @@ -115,6 +115,7 @@ class StableDiffusionInpaintPipelineLegacy( feature_extractor ([`CLIPImageProcessor`]): Model that extracts features from generated images to be used as inputs for the `safety_checker`. """ + model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["feature_extractor"] def __init__( @@ -214,35 +215,6 @@ def __init__( self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.register_to_config(requires_safety_checker=requires_safety_checker) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a - time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs. - Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the - iterative execution of the `unet`. 
- """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - if self.safety_checker is not None: - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - - # We'll offload the last model manually. - self.final_offload_hook = hook - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, @@ -761,9 +733,8 @@ def __call__( image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + # Offload all models + self.maybe_free_model_hooks() if not return_dict: return (image, has_nsfw_concept) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py index 8ed36f771db9..4c8a7c6cc176 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py @@ -24,7 +24,7 @@ from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers -from ...utils import PIL_INTERPOLATION, deprecate, is_accelerate_available, is_accelerate_version, logging +from ...utils import PIL_INTERPOLATION, deprecate, logging from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from . import StableDiffusionPipelineOutput @@ -89,6 +89,7 @@ class StableDiffusionInstructPix2PixPipeline(DiffusionPipeline, TextualInversion feature_extractor ([`~transformers.CLIPImageProcessor`]): A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. """ + model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] def __init__( @@ -392,44 +393,14 @@ def __call__( image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + # Offload all models + self.maybe_free_model_hooks() if not return_dict: return (image, has_nsfw_concept) return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offload all models to CPU to reduce memory usage with a low impact on performance. 
Moves one whole model at a - time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs. - Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the - iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - if self.safety_checker is not None: - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - - # We'll offload the last model manually. - self.final_offload_hook = hook - def _encode_prompt( self, prompt, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py index f4509cd4a960..6aca8a4feb8c 100755 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py @@ -24,7 +24,7 @@ from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import LMSDiscreteScheduler -from ...utils import deprecate, is_accelerate_available, is_accelerate_version, logging +from ...utils import deprecate, logging from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from . import StableDiffusionPipelineOutput @@ -80,6 +80,7 @@ class StableDiffusionKDiffusionPipeline(DiffusionPipeline, TextualInversionLoade feature_extractor ([`CLIPImageProcessor`]): Model that extracts features from generated images to be used as inputs for the `safety_checker`. """ + model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] def __init__( @@ -128,35 +129,6 @@ def set_scheduler(self, scheduler_type: str): sampling = getattr(library, "sampling") self.sampler = getattr(sampling, scheduler_type) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a - time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs. - Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the - iterative execution of the `unet`. 
- """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - if self.safety_checker is not None: - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - - # We'll offload the last model manually. - self.final_offload_hook = hook - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, @@ -630,9 +602,8 @@ def model_fn(x, t): image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + # Offload all models + self.maybe_free_model_hooks() if not return_dict: return (image, has_nsfw_concept) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py index 4141b65f5096..3cdc48e6c28b 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py @@ -78,6 +78,7 @@ class StableDiffusionLatentUpscalePipeline(DiffusionPipeline): scheduler ([`SchedulerMixin`]): A [`EulerDiscreteScheduler`] to be used in combination with `unet` to denoise the encoded image latents. """ + model_cpu_offload_seq = "text_encoder->unet->vae" def __init__( self, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py index 3400497670c9..ad1c39607672 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py @@ -29,8 +29,6 @@ from ...utils import ( BaseOutput, deprecate, - is_accelerate_available, - is_accelerate_version, logging, replace_example_docstring, ) @@ -114,6 +112,7 @@ class StableDiffusionLDM3DPipeline( feature_extractor ([`~transformers.CLIPImageProcessor`]): A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. """ + model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] def __init__( @@ -191,35 +190,6 @@ def disable_vae_tiling(self): """ self.vae.disable_tiling() - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a - time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs. 
- Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the - iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - if self.safety_checker is not None: - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - - # We'll offload the last model manually. - self.final_offload_hook = hook - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, @@ -696,9 +666,8 @@ def __call__( rgb, depth = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + # Offload all models + self.maybe_free_model_hooks() if not return_dict: return ((rgb, depth), has_nsfw_concept) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py index a92515cfb4a5..5ef8ba48d4f5 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py @@ -66,6 +66,7 @@ class StableDiffusionModelEditingPipeline(DiffusionPipeline, TextualInversionLoa with_augs ([`list`]): Textual augmentations to apply while editing the text-to-image model. Set to `[]` for no augmentations. """ + model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] def __init__( @@ -787,9 +788,8 @@ def __call__( image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + # Offload all models + self.maybe_free_model_hooks() if not return_dict: return (image, has_nsfw_concept) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py index 0956bfefa372..811cf69cabf5 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py @@ -78,6 +78,7 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderM feature_extractor ([`~transformers.CLIPImageProcessor`]): A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. 
""" + model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] def __init__( diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_paradigms.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_paradigms.py index cf597ac062bf..735951723dda 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_paradigms.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_paradigms.py @@ -25,8 +25,6 @@ from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( deprecate, - is_accelerate_available, - is_accelerate_version, logging, replace_example_docstring, ) @@ -95,6 +93,7 @@ class StableDiffusionParadigmsPipeline( feature_extractor ([`~transformers.CLIPImageProcessor`]): A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. """ + model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] def __init__( @@ -175,35 +174,6 @@ def disable_vae_tiling(self): """ self.vae.disable_tiling() - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a - time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs. - Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the - iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - if self.safety_checker is not None: - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - - # We'll offload the last model manually. 
- self.final_offload_hook = hook - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, @@ -803,9 +773,8 @@ def __call__( image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + # Offload all models + self.maybe_free_model_hooks() if not return_dict: return (image, has_nsfw_concept) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py index be3ffa4071eb..5a55108575b4 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py @@ -39,8 +39,6 @@ PIL_INTERPOLATION, BaseOutput, deprecate, - is_accelerate_available, - is_accelerate_version, logging, replace_example_docstring, ) @@ -309,6 +307,7 @@ class StableDiffusionPix2PixZeroPipeline(DiffusionPipeline): Whether the pipeline requires a safety checker. We recommend setting it to True if you're using the pipeline publicly. """ + model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = [ "safety_checker", "feature_extractor", @@ -365,30 +364,6 @@ def __init__( self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.register_to_config(requires_safety_checker=requires_safety_checker) - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - hook = None - for cpu_offloaded_model in [self.vae, self.text_encoder, self.unet, self.vae]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - if self.safety_checker is not None: - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - - # We'll offload the last model manually. 
- self.final_offload_hook = hook - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, @@ -1081,9 +1056,8 @@ def __call__( image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + # Offload all models + self.maybe_free_model_hooks() if not return_dict: return (image, has_nsfw_concept) @@ -1286,9 +1260,8 @@ def invert( image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] image = self.image_processor.postprocess(image, output_type=output_type) - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + # Offload all models + self.maybe_free_model_hooks() if not return_dict: return (inverted_latents, image) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py index 7580c11936c0..204b5bd10986 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py @@ -117,6 +117,7 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, TextualInversionLoaderMixin) feature_extractor ([`~transformers.CLIPImageProcessor`]): A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. """ + model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] def __init__( diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py index 4e5e77a5e2db..855d178ae541 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py @@ -32,7 +32,7 @@ ) from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import DDPMScheduler, KarrasDiffusionSchedulers -from ...utils import deprecate, is_accelerate_available, is_accelerate_version, logging +from ...utils import deprecate, logging from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from . import StableDiffusionPipelineOutput @@ -90,6 +90,7 @@ class StableDiffusionUpscalePipeline(DiffusionPipeline, TextualInversionLoaderMi A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. """ + model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["watermarker", "safety_checker", "feature_extractor"] def __init__( @@ -140,32 +141,6 @@ def __init__( self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, resample="bicubic") self.register_to_config(max_noise_level=max_noise_level) - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a - time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs. 
- Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the - iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]: - if cpu_offloaded_model is not None: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - # We'll offload the last model manually. - self.final_offload_hook = hook - def run_safety_checker(self, image, device, dtype): if self.safety_checker is not None: feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py index 2ac9a52570ca..917b9fef0ead 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py @@ -27,8 +27,6 @@ from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( deprecate, - is_accelerate_available, - is_accelerate_version, logging, replace_example_docstring, ) @@ -92,6 +90,7 @@ class StableUnCLIPPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL """ _exclude_from_cpu_offload = ["prior", "image_normalizer"] + model_cpu_offload_seq = "text_encoder->prior_text_encoder->unet->vae" # prior components prior_tokenizer: CLIPTokenizer @@ -164,31 +163,6 @@ def disable_vae_slicing(self): """ self.vae.disable_slicing() - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a - time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs. - Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the - iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.text_encoder, self.prior_text_encoder, self.unet, self.vae]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - # We'll offload the last model manually. 
- self.final_offload_hook = hook - # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline._encode_prompt with _encode_prompt->_encode_prior_prompt, tokenizer->prior_tokenizer, text_encoder->prior_text_encoder def _encode_prior_prompt( self, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py index dae0846ea64b..be837564fddc 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py @@ -19,15 +19,13 @@ import torch from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection -from diffusers.utils.import_utils import is_accelerate_available - from ...image_processor import VaeImageProcessor from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.embeddings import get_timestep_embedding from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers -from ...utils import deprecate, is_accelerate_version, logging, replace_example_docstring +from ...utils import deprecate, logging, replace_example_docstring from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer @@ -94,6 +92,7 @@ class StableUnCLIPImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. """ + model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae" _exclude_from_cpu_offload = ["image_normalizer"] # image encoding components @@ -161,31 +160,6 @@ def disable_vae_slicing(self): """ self.vae.disable_slicing() - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a - time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs. - Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the - iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.text_encoder, self.image_encoder, self.unet, self.vae]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - # We'll offload the last model manually. 
- self.final_offload_hook = hook - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, diff --git a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py index 88b6e29f4b21..40326c1c035b 100644 --- a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +++ b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py @@ -47,6 +47,7 @@ class StableDiffusionPipelineSafe(DiffusionPipeline): A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. """ + model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] def __init__( diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index 81c783bdfd2f..10e966b248ab 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -129,6 +129,7 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin, LoraLoad watermark output images. If not defined, it will default to True if the package is installed, otherwise no watermarker will be used. """ + model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae" def __init__( self, @@ -198,36 +199,6 @@ def disable_vae_tiling(self): """ self.vae.disable_tiling() - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - model_sequence = ( - [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2] - ) - model_sequence.extend([self.unet, self.vae]) - - hook = None - for cpu_offloaded_model in model_sequence: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - # We'll offload the last model manually. 
- self.final_offload_hook = hook - def encode_prompt( self, prompt: str, @@ -900,17 +871,16 @@ def __call__( self.vae.to(dtype=torch.float16) else: image = latents - return StableDiffusionXLPipelineOutput(images=image) - # apply watermark if available - if self.watermark is not None: - image = self.watermark.apply_watermark(image) + if not output_type == "latent": + # apply watermark if available + if self.watermark is not None: + image = self.watermark.apply_watermark(image) - image = self.image_processor.postprocess(image, output_type=output_type) + image = self.image_processor.postprocess(image, output_type=output_type) - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + # Offload all models + self.maybe_free_model_hooks() if not return_dict: return (image,) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index 5af3b07f28a3..8e26a2ad067d 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -134,6 +134,8 @@ class StableDiffusionXLImg2ImgPipeline( watermark output images. If not defined, it will default to True if the package is installed, otherwise no watermarker will be used. """ + model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae" + _optional_components = ["tokenizer", "text_encoder"] def __init__( @@ -205,36 +207,6 @@ def disable_vae_tiling(self): """ self.vae.disable_tiling() - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - model_sequence = ( - [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2] - ) - model_sequence.extend([self.unet, self.vae]) - - hook = None - for cpu_offloaded_model in model_sequence: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - # We'll offload the last model manually. 
- self.final_offload_hook = hook - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt def encode_prompt( self, @@ -1057,9 +1029,8 @@ def denoising_value_valid(dnv): image = self.image_processor.postprocess(image, output_type=output_type) - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + # Offload all models + self.maybe_free_model_hooks() if not return_dict: return (image,) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py index c47b53b53bef..6fdc688d9eae 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py @@ -280,6 +280,8 @@ class StableDiffusionXLInpaintPipeline( watermark output images. If not defined, it will default to True if the package is installed, otherwise no watermarker will be used. """ + model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae" + _optional_components = ["tokenizer", "text_encoder"] def __init__( @@ -354,37 +356,6 @@ def disable_vae_tiling(self): """ self.vae.disable_tiling() - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline.enable_model_cpu_offload - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - model_sequence = ( - [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2] - ) - model_sequence.extend([self.unet, self.vae]) - - hook = None - for cpu_offloaded_model in model_sequence: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - # We'll offload the last model manually. 
- self.final_offload_hook = hook - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt def encode_prompt( self, @@ -1377,9 +1348,8 @@ def denoising_value_valid(dnv): image = self.image_processor.postprocess(image, output_type=output_type) - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + # Offload all models + self.maybe_free_model_hooks() if not return_dict: return (image,) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py index c283f5bade68..614cc0e6477c 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py @@ -32,8 +32,6 @@ from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( deprecate, - is_accelerate_available, - is_accelerate_version, is_invisible_watermark_available, logging, replace_example_docstring, @@ -143,6 +141,7 @@ class StableDiffusionXLInstructPix2PixPipeline( watermark output images. If not defined, it will default to True if the package is installed, otherwise no watermarker will be used. """ + model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae" def __init__( self, @@ -211,38 +210,6 @@ def disable_vae_tiling(self): """ self.vae.disable_tiling() - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.enable_model_cpu_offload - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - model_sequence = ( - [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2] - ) - model_sequence.extend([self.unet, self.vae]) - - hook = None - for cpu_offloaded_model in model_sequence: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - # We'll offload the last model manually. 
- self.final_offload_hook = hook - - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt def encode_prompt( self, prompt: str, @@ -965,9 +932,8 @@ def __call__( image = self.image_processor.postprocess(image, output_type=output_type) - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + # Offload all models + self.maybe_free_model_hooks() if not return_dict: return (image,) diff --git a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py index 8884c94eb72e..4120d5f9dfe6 100644 --- a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py @@ -30,8 +30,6 @@ PIL_INTERPOLATION, BaseOutput, deprecate, - is_accelerate_available, - is_accelerate_version, logging, replace_example_docstring, ) @@ -151,6 +149,7 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline): feature_extractor ([`CLIPFeatureExtractor`]): Model that extracts features from generated images to be used as inputs for the `safety_checker`. """ + model_cpu_offload_seq = "text_encoder->adapter->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] def __init__( @@ -217,34 +216,6 @@ def disable_vae_slicing(self): """ self.vae.disable_slicing() - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.text_encoder, self.adapter, self.unet, self.vae]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - if self.safety_checker is not None: - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - - # We'll offload the last model manually. - self.final_offload_hook = hook - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, @@ -815,9 +786,8 @@ def __call__( # 9. 
Run safety checker image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + # Offload all models + self.maybe_free_model_hooks() if not return_dict: return (image, has_nsfw_concept) diff --git a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py index 5116dd8f7b52..4bf0e3311865 100644 --- a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py @@ -35,8 +35,6 @@ from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( PIL_INTERPOLATION, - is_accelerate_available, - is_accelerate_version, logging, replace_example_docstring, ) @@ -159,6 +157,7 @@ class StableDiffusionXLAdapterPipeline( feature_extractor ([`CLIPFeatureExtractor`]): Model that extracts features from generated images to be used as inputs for the `safety_checker`. """ + model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae" def __init__( self, @@ -222,37 +221,6 @@ def disable_vae_tiling(self): """ self.vae.disable_tiling() - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.enable_model_cpu_offload - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - model_sequence = ( - [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2] - ) - model_sequence.extend([self.unet, self.vae]) - - hook = None - for cpu_offloaded_model in model_sequence: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - # We'll offload the last model manually. 
- self.final_offload_hook = hook - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt def encode_prompt( self, diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py index 678c2fbff438..a8395a5e86c8 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py @@ -25,8 +25,6 @@ from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( deprecate, - is_accelerate_available, - is_accelerate_version, logging, replace_example_docstring, ) @@ -95,6 +93,7 @@ class TextToVideoSDPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lora A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. """ + model_cpu_offload_seq = "text_encoder->unet->vae" def __init__( self, @@ -148,31 +147,6 @@ def disable_vae_tiling(self): """ self.vae.disable_tiling() - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a - time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs. - Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the - iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - # We'll offload the last model manually. 
- self.final_offload_hook = hook - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, @@ -678,9 +652,8 @@ def __call__( else: video = tensor2vid(video_tensor) - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + # Offload all models + self.maybe_free_model_hooks() if not return_dict: return (video,) diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py index b7a4bfdd8859..1e1b30e18fcb 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py @@ -26,8 +26,6 @@ from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( deprecate, - is_accelerate_available, - is_accelerate_version, logging, replace_example_docstring, ) @@ -157,6 +155,7 @@ class VideoToVideoSDPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. """ + model_cpu_offload_seq = "text_encoder->unet->vae" def __init__( self, @@ -210,31 +209,6 @@ def disable_vae_tiling(self): """ self.vae.disable_tiling() - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a - time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs. - Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the - iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.text_encoder, self.vae, self.unet]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - # We'll offload the last model manually. 
- self.final_offload_hook = hook - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, @@ -753,9 +727,8 @@ def __call__( else: video = tensor2vid(video_tensor) - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + # Offload all models + self.maybe_free_model_hooks() if not return_dict: return (video,) diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py index f7157e43c0de..48d6d72259c6 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py @@ -635,9 +635,8 @@ def __call__( # Run safety checker image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() + # Offload all models + self.maybe_free_model_hooks() if not return_dict: return (image, has_nsfw_concept) diff --git a/src/diffusers/pipelines/unclip/pipeline_unclip.py b/src/diffusers/pipelines/unclip/pipeline_unclip.py index 7e8dc22f6ca2..c4a25c865d88 100644 --- a/src/diffusers/pipelines/unclip/pipeline_unclip.py +++ b/src/diffusers/pipelines/unclip/pipeline_unclip.py @@ -76,6 +76,8 @@ class UnCLIPPipeline(DiffusionPipeline): decoder_scheduler: UnCLIPScheduler super_res_scheduler: UnCLIPScheduler + model_cpu_offload_seq = "text_encoder->text_proj->decoder->super_res_first->super_res_last" + def __init__( self, prior: PriorTransformer, diff --git a/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py b/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py index 8ec917f9e297..de3b23c97ecd 100644 --- a/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +++ b/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py @@ -77,6 +77,7 @@ class UnCLIPImageVariationPipeline(DiffusionPipeline): decoder_scheduler: UnCLIPScheduler super_res_scheduler: UnCLIPScheduler + model_cpu_offload_seq = "text_encoder->image_encoder->text_proj->decoder->super_res_first->super_res_last" def __init__( self, diff --git a/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py b/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py index 2fcb89734089..5d06d00e2a30 100644 --- a/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +++ b/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py @@ -103,6 +103,9 @@ class UniDiffuserPipeline(DiffusionPipeline): original UniDiffuser paper uses the [`DPMSolverMultistepScheduler`] scheduler. 
""" + # TODO: support for moving submodules for components with enable_model_cpu_offload + model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae->text_decoder" + def __init__( self, vae: AutoencoderKL, @@ -173,7 +176,15 @@ def enable_model_cpu_offload(self, gpu_id=0): torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) hook = None - for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae, self.image_encoder, self.text_decoder]: + for cpu_offloaded_model in [ + self.text_encoder.text_model, + self.image_encoder, + self.unet, + self.vae, + self.text_decoder.encode_prefix, + self.text_decoder.decode_prefix, + self.text_decoder, + ]: _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) if self.safety_checker is not None: @@ -1344,6 +1355,8 @@ def __call__( for output, length in zip(output_list, seq_lengths) ] + self.maybe_free_model_hooks() + # 10. Convert to PIL if output_type == "pil" and gen_image is not None: gen_image = self.numpy_to_pil(gen_image) diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py index cbb91e8a9e9a..781d19809124 100644 --- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py @@ -58,6 +58,8 @@ class VersatileDiffusionDualGuidedPipeline(DiffusionPipeline): A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. """ + model_cpu_offload_seq = "bert->unet->vqvae" + tokenizer: CLIPTokenizer image_feature_extractor: CLIPImageProcessor text_encoder: CLIPTextModelWithProjection diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py index f06aa4b45d4d..7d105ad1fb38 100644 --- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py @@ -52,6 +52,8 @@ class VersatileDiffusionImageVariationPipeline(DiffusionPipeline): A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. """ + model_cpu_offload_seq = "bert->unet->vqvae" + image_feature_extractor: CLIPImageProcessor image_encoder: CLIPVisionModelWithProjection image_unet: UNet2DConditionModel diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py index f2d3aebce2b6..9c9b854b8334 100644 --- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py @@ -51,6 +51,8 @@ class VersatileDiffusionTextToImagePipeline(DiffusionPipeline): A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. 
""" + model_cpu_offload_seq = "bert->unet->vqvae" + tokenizer: CLIPTokenizer image_feature_extractor: CLIPImageProcessor text_encoder: CLIPTextModelWithProjection diff --git a/tests/pipelines/audioldm2/test_audioldm2.py b/tests/pipelines/audioldm2/test_audioldm2.py index b37fe4dcec48..5134fbaa6042 100644 --- a/tests/pipelines/audioldm2/test_audioldm2.py +++ b/tests/pipelines/audioldm2/test_audioldm2.py @@ -44,7 +44,7 @@ LMSDiscreteScheduler, PNDMScheduler, ) -from diffusers.utils import is_accelerate_available, is_accelerate_version, is_xformers_available +from diffusers.utils import is_xformers_available from diffusers.utils.testing_utils import enable_full_determinism, slow, torch_device from ..pipeline_params import TEXT_TO_AUDIO_BATCH_PARAMS, TEXT_TO_AUDIO_PARAMS @@ -491,26 +491,6 @@ def test_to_dtype(self): model_dtypes = {key: component.dtype for key, component in components.items() if hasattr(component, "dtype")} self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes.values())) - @unittest.skipIf( - torch_device != "cuda" or not is_accelerate_available() or is_accelerate_version("<", "0.17.0"), - reason="CPU offload is only available with CUDA and `accelerate v0.17.0` or higher", - ) - def test_model_cpu_offload(self, expected_max_diff=2e-4): - components = self.get_dummy_components() - audioldm_pipe = AudioLDM2Pipeline(**components) - audioldm_pipe = audioldm_pipe.to(torch_device) - audioldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - output_without_offload = audioldm_pipe(**inputs)[0] - - audioldm_pipe.enable_model_cpu_offload() - inputs = self.get_dummy_inputs(torch_device) - output_with_offload = audioldm_pipe(**inputs)[0] - - max_diff = np.abs(output_with_offload - output_without_offload).max() - self.assertLess(max_diff, expected_max_diff, "CPU offloading should not affect the inference results") - @slow class AudioLDM2PipelineSlowTests(unittest.TestCase): diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py index fcd6ff8d77f3..a8f489012bf7 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py @@ -163,8 +163,8 @@ def test_inference(self): max_diff = np.abs(image_slice.flatten() - expected_slice).max() self.assertLessEqual(max_diff, 1e-3) - def test_cpu_offload_forward_pass(self): - super().test_cpu_offload_forward_pass(expected_max_diff=5e-4) + def test_sequential_cpu_offload_forward_pass(self): + super().test_sequential_cpu_offload_forward_pass(expected_max_diff=5e-4) def test_inference_batch_consistent(self): # NOTE: Larger batch sizes cause this test to timeout, only test on smaller batches diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py index e20438a2af6b..75199b55ee21 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py @@ -181,8 +181,8 @@ def test_inference(self): def test_attention_slicing_forward_pass(self): super().test_attention_slicing_forward_pass(expected_max_diff=7e-3) - def test_cpu_offload_forward_pass(self): - super().test_cpu_offload_forward_pass(expected_max_diff=3e-3) + def test_sequential_cpu_offload_forward_pass(self): + 
super().test_sequential_cpu_offload_forward_pass(expected_max_diff=3e-3) def test_dict_tuple_outputs_equivalent(self): super().test_dict_tuple_outputs_equivalent(expected_max_difference=3e-3) diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index c70ccc635780..13866f997054 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -717,7 +717,7 @@ def _test_attention_slicing_forward_pass( torch_device != "cuda" or not is_accelerate_available() or is_accelerate_version("<", "0.14.0"), reason="CPU offload is only available with CUDA and `accelerate v0.14.0` or higher", ) - def test_cpu_offload_forward_pass(self, expected_max_diff=1e-4): + def test_sequential_cpu_offload_forward_pass(self, expected_max_diff=1e-4): components = self.get_dummy_components() pipe = self.pipeline_class(**components) for component in pipe.components.values(): @@ -726,11 +726,39 @@ def test_cpu_offload_forward_pass(self, expected_max_diff=1e-4): pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs(torch_device) + generator_device = "cpu" + inputs = self.get_dummy_inputs(generator_device) output_without_offload = pipe(**inputs)[0] pipe.enable_sequential_cpu_offload() - inputs = self.get_dummy_inputs(torch_device) + + inputs = self.get_dummy_inputs(generator_device) + output_with_offload = pipe(**inputs)[0] + + max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max() + self.assertLess(max_diff, expected_max_diff, "CPU offloading should not affect the inference results") + + @unittest.skipIf( + torch_device != "cuda" or not is_accelerate_available() or is_accelerate_version("<", "0.17.0"), + reason="CPU offload is only available with CUDA and `accelerate v0.17.0` or higher", + ) + def test_model_cpu_offload_forward_pass(self, expected_max_diff=2e-4): + generator_device = "cpu" + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + + for component in pipe.components.values(): + if hasattr(component, "set_default_attn_processor"): + component.set_default_attn_processor() + + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(generator_device) + output_without_offload = pipe(**inputs)[0] + + pipe.enable_model_cpu_offload() + inputs = self.get_dummy_inputs(generator_device) output_with_offload = pipe(**inputs)[0] max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max() From d82157b3ce7884308eaa9f19b7362f53505c9824 Mon Sep 17 00:00:00 2001 From: zhiqiang Date: Tue, 12 Sep 2023 01:45:58 +0800 Subject: [PATCH 16/37] [Bug Fix] Should pass the dtype instead of torch_dtype (#4917) . 
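For context on this one-line fix: `torch.nn.Module.to` accepts a `dtype` keyword (or a positional dtype), not `torch_dtype`, so the old call raised a TypeError instead of converting the freshly built VAE. A minimal sketch of the failure and the fix, using a stand-in `nn.Linear` in place of the real `AutoencoderKL` returned by `from_single_file`:

    import torch
    from torch import nn

    vae = nn.Linear(4, 4)  # stand-in for the converted VAE

    try:
        vae.to(torch_dtype=torch.float16)  # old call: `to()` has no `torch_dtype` keyword
    except TypeError as err:
        print(f"rejected as expected: {err}")

    vae.to(dtype=torch.float16)  # the fix: forward the value through `dtype`
    assert next(vae.parameters()).dtype == torch.float16
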
--- src/diffusers/loaders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index 1de899cad927..52c140a6782a 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -2381,7 +2381,7 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs): vae.load_state_dict(converted_vae_checkpoint) if torch_dtype is not None: - vae.to(torch_dtype=torch_dtype) + vae.to(dtype=torch_dtype) return vae From 18b7264bd0e2d30ef2f23d1ce53c88256d39d3bd Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 12 Sep 2023 11:05:53 +0200 Subject: [PATCH 17/37] [Utils] Correct custom init sort (#4967) * [Utils] Correct custom init sort * [Utils] Correct custom init sort * [Utils] Correct custom init sort * add type checking * fix custom init sort * fix test * fix tests --------- Co-authored-by: Dhruv Nair --- src/diffusers/models/__init__.py | 31 ++- src/diffusers/pipelines/__init__.py | 251 +++++++++++++++--- .../pipelines/alt_diffusion/__init__.py | 39 ++- .../pipelines/audio_diffusion/__init__.py | 29 +- src/diffusers/pipelines/audioldm/__init__.py | 36 ++- src/diffusers/pipelines/audioldm2/__init__.py | 21 +- .../pipelines/consistency_models/__init__.py | 22 +- .../pipelines/controlnet/__init__.py | 56 ++-- .../pipelines/dance_diffusion/__init__.py | 24 +- src/diffusers/pipelines/ddim/__init__.py | 23 +- src/diffusers/pipelines/ddpm/__init__.py | 22 +- .../pipelines/deepfloyd_if/__init__.py | 75 ++++-- src/diffusers/pipelines/dit/__init__.py | 22 +- src/diffusers/pipelines/kandinsky/__init__.py | 43 ++- .../pipelines/kandinsky2_2/__init__.py | 45 +++- .../pipelines/latent_diffusion/__init__.py | 34 ++- .../latent_diffusion_uncond/__init__.py | 23 +- src/diffusers/pipelines/musicldm/__init__.py | 34 ++- .../pipelines/paint_by_example/__init__.py | 34 ++- src/diffusers/pipelines/pndm/__init__.py | 24 +- src/diffusers/pipelines/repaint/__init__.py | 22 +- .../pipelines/score_sde_ve/__init__.py | 22 +- .../semantic_stable_diffusion/__init__.py | 34 ++- src/diffusers/pipelines/shap_e/__init__.py | 45 +++- .../spectrogram_diffusion/__init__.py | 54 ++-- .../pipelines/stable_diffusion/__init__.py | 145 ++++++++-- .../stable_diffusion_safe/__init__.py | 80 +++--- .../stable_diffusion_safe/pipeline_output.py | 34 +++ .../pipelines/stable_diffusion_xl/__init__.py | 37 ++- .../stochastic_karras_ve/__init__.py | 22 +- .../pipelines/t2i_adapter/__init__.py | 32 ++- .../text_to_video_synthesis/__init__.py | 32 ++- src/diffusers/pipelines/unclip/__init__.py | 35 ++- .../pipelines/unidiffuser/__init__.py | 39 ++- .../pipelines/versatile_diffusion/__init__.py | 41 ++- .../pipelines/vq_diffusion/__init__.py | 32 ++- .../pipelines/wuerstchen/__init__.py | 37 ++- src/diffusers/schedulers/__init__.py | 121 +++++++-- utils/custom_init_isort.py | 117 ++++++-- 39 files changed, 1371 insertions(+), 498 deletions(-) create mode 100644 src/diffusers/pipelines/stable_diffusion_safe/pipeline_output.py diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py index fc60ff845ccf..75ddb21fb15d 100644 --- a/src/diffusers/models/__init__.py +++ b/src/diffusers/models/__init__.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from typing import TYPE_CHECKING + from ..utils import _LazyModule, is_flax_available, is_torch_available @@ -40,7 +42,32 @@ _import_structure["unet_2d_condition_flax"] = ["FlaxUNet2DConditionModel"] _import_structure["vae_flax"] = ["FlaxAutoencoderKL"] -import sys +if TYPE_CHECKING: + if is_torch_available(): + from .adapter import MultiAdapter, T2IAdapter + from .autoencoder_asym_kl import AsymmetricAutoencoderKL + from .autoencoder_kl import AutoencoderKL + from .autoencoder_tiny import AutoencoderTiny + from .controlnet import ControlNetModel + from .dual_transformer_2d import DualTransformer2DModel + from .modeling_utils import ModelMixin + from .prior_transformer import PriorTransformer + from .t5_film_transformer import T5FilmDecoder + from .transformer_2d import Transformer2DModel + from .transformer_temporal import TransformerTemporalModel + from .unet_1d import UNet1DModel + from .unet_2d import UNet2DModel + from .unet_2d_condition import UNet2DConditionModel + from .unet_3d_condition import UNet3DConditionModel + from .vq_model import VQModel + + if is_flax_available(): + from .controlnet_flax import FlaxControlNetModel + from .unet_2d_condition_flax import FlaxUNet2DConditionModel + from .vae_flax import FlaxAutoencoderKL + +else: + import sys -sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index b237adae7d54..8bf0a98de893 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -1,3 +1,5 @@ +from typing import TYPE_CHECKING + from ..utils import ( OptionalDependencyNotAvailable, _LazyModule, @@ -13,8 +15,8 @@ # These modules contain pipelines from multiple libraries/frameworks -_import_structure = {"stable_diffusion": [], "latent_diffusion": [], "controlnet": []} _dummy_objects = {} +_import_structure = {"stable_diffusion": [], "latent_diffusion": [], "controlnet": []} try: if not is_torch_available(): @@ -23,7 +25,6 @@ from ..utils import dummy_pt_objects # noqa F403 _dummy_objects.update(get_objects_from_module(dummy_pt_objects)) - else: _import_structure["auto_pipeline"] = [ "AutoPipelineForImage2Image", @@ -42,7 +43,6 @@ _import_structure["repaint"] = ["RePaintPipeline"] _import_structure["score_sde_ve"] = ["ScoreSdeVePipeline"] _import_structure["stochastic_karras_ve"] = ["KarrasVePipeline"] - try: if not (is_torch_available() and is_librosa_available()): raise OptionalDependencyNotAvailable() @@ -50,10 +50,8 @@ from ..utils import dummy_torch_and_librosa_objects # noqa F403 _dummy_objects.update(get_objects_from_module(dummy_torch_and_librosa_objects)) - else: _import_structure["audio_diffusion"] = ["AudioDiffusionPipeline", "Mel"] - try: if not (is_torch_available() and is_transformers_available()): raise OptionalDependencyNotAvailable() @@ -61,11 +59,14 @@ from ..utils import dummy_torch_and_transformers_objects # noqa F403 _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) - else: _import_structure["alt_diffusion"] = ["AltDiffusionImg2ImgPipeline", "AltDiffusionPipeline"] _import_structure["audioldm"] = ["AudioLDMPipeline"] - _import_structure["audioldm2"] = ["AudioLDM2Pipeline", "AudioLDM2ProjectionModel", "AudioLDM2UNet2DConditionModel"] + _import_structure["audioldm2"] = [ + "AudioLDM2Pipeline", + "AudioLDM2ProjectionModel", + 
"AudioLDM2UNet2DConditionModel", + ] _import_structure["controlnet"].extend( [ "StableDiffusionControlNetImg2ImgPipeline", @@ -117,6 +118,8 @@ "StableDiffusionDepth2ImgPipeline", "StableDiffusionDiffEditPipeline", "StableDiffusionGLIGENPipeline", + "StableDiffusionGLIGENPipeline", + "StableDiffusionGLIGENTextImagePipeline", "StableDiffusionImageVariationPipeline", "StableDiffusionImg2ImgPipeline", "StableDiffusionInpaintPipeline", @@ -133,8 +136,6 @@ "StableDiffusionUpscalePipeline", "StableUnCLIPImg2ImgPipeline", "StableUnCLIPPipeline", - "StableDiffusionGLIGENTextImagePipeline", - "StableDiffusionGLIGENPipeline", ] ) _import_structure["stable_diffusion_safe"] = ["StableDiffusionPipelineSafe"] @@ -169,8 +170,6 @@ "WuerstchenDecoderPipeline", "WuerstchenPriorPipeline", ] - - try: if not is_onnx_available(): raise OptionalDependencyNotAvailable() @@ -178,10 +177,8 @@ from ..utils import dummy_onnx_objects # noqa F403 _dummy_objects.update(get_objects_from_module(dummy_onnx_objects)) - else: _import_structure["onnx_utils"] = ["OnnxRuntimeModel"] - try: if not (is_torch_available() and is_transformers_available() and is_onnx_available()): raise OptionalDependencyNotAvailable() @@ -189,7 +186,6 @@ from ..utils import dummy_torch_and_transformers_and_onnx_objects # noqa F403 _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_and_onnx_objects)) - else: _import_structure["stable_diffusion"].extend( [ @@ -201,7 +197,6 @@ "StableDiffusionOnnxPipeline", ] ) - try: if not (is_torch_available() and is_transformers_available() and is_k_diffusion_available()): raise OptionalDependencyNotAvailable() @@ -209,10 +204,8 @@ from ..utils import dummy_torch_and_transformers_and_k_diffusion_objects # noqa F403 _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_and_k_diffusion_objects)) - else: _import_structure["stable_diffusion"].extend(["StableDiffusionKDiffusionPipeline"]) - try: if not is_flax_available(): raise OptionalDependencyNotAvailable() @@ -220,11 +213,8 @@ from ..utils import dummy_flax_objects # noqa F403 _dummy_objects.update(get_objects_from_module(dummy_flax_objects)) - else: _import_structure["pipeline_flax_utils"] = ["FlaxDiffusionPipeline"] - - try: if not (is_flax_available() and is_transformers_available()): raise OptionalDependencyNotAvailable() @@ -232,7 +222,6 @@ from ..utils import dummy_flax_and_transformers_objects # noqa F403 _dummy_objects.update(get_objects_from_module(dummy_flax_and_transformers_objects)) - else: _import_structure["controlnet"].extend(["FlaxStableDiffusionControlNetPipeline"]) _import_structure["stable_diffusion"].extend( @@ -249,19 +238,217 @@ from ..utils import dummy_transformers_and_torch_and_note_seq_objects # noqa F403 _dummy_objects.update(get_objects_from_module(dummy_transformers_and_torch_and_note_seq_objects)) - else: _import_structure["spectrogram_diffusion"] = ["MidiProcessor", "SpectrogramDiffusionPipeline"] +if TYPE_CHECKING: + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ..utils.dummy_pt_objects import * # noqa F403 + + else: + from .auto_pipeline import AutoPipelineForImage2Image, AutoPipelineForInpainting, AutoPipelineForText2Image + from .consistency_models import ConsistencyModelPipeline + from .dance_diffusion import DanceDiffusionPipeline + from .ddim import DDIMPipeline + from .ddpm import DDPMPipeline + from .dit import DiTPipeline + from .latent_diffusion import LDMSuperResolutionPipeline + from 
.latent_diffusion_uncond import LDMPipeline + from .pipeline_utils import AudioPipelineOutput, DiffusionPipeline, ImagePipelineOutput + from .pndm import PNDMPipeline + from .repaint import RePaintPipeline + from .score_sde_ve import ScoreSdeVePipeline + from .stochastic_karras_ve import KarrasVePipeline + + try: + if not (is_torch_available() and is_librosa_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ..utils.dummy_torch_and_librosa_objects import * + else: + from .audio_diffusion import AudioDiffusionPipeline, Mel + + try: + if not (is_torch_available() and is_transformers_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ..utils.dummy_torch_and_transformers_objects import * + else: + from .alt_diffusion import AltDiffusionImg2ImgPipeline, AltDiffusionPipeline + from .audioldm import AudioLDMPipeline + from .audioldm2 import AudioLDM2Pipeline, AudioLDM2ProjectionModel, AudioLDM2UNet2DConditionModel + from .controlnet import ( + StableDiffusionControlNetImg2ImgPipeline, + StableDiffusionControlNetInpaintPipeline, + StableDiffusionControlNetPipeline, + StableDiffusionXLControlNetImg2ImgPipeline, + StableDiffusionXLControlNetInpaintPipeline, + StableDiffusionXLControlNetPipeline, + ) + from .deepfloyd_if import ( + IFImg2ImgPipeline, + IFImg2ImgSuperResolutionPipeline, + IFInpaintingPipeline, + IFInpaintingSuperResolutionPipeline, + IFPipeline, + IFSuperResolutionPipeline, + ) + from .kandinsky import ( + KandinskyCombinedPipeline, + KandinskyImg2ImgCombinedPipeline, + KandinskyImg2ImgPipeline, + KandinskyInpaintCombinedPipeline, + KandinskyInpaintPipeline, + KandinskyPipeline, + KandinskyPriorPipeline, + ) + from .kandinsky2_2 import ( + KandinskyV22CombinedPipeline, + KandinskyV22ControlnetImg2ImgPipeline, + KandinskyV22ControlnetPipeline, + KandinskyV22Img2ImgCombinedPipeline, + KandinskyV22Img2ImgPipeline, + KandinskyV22InpaintCombinedPipeline, + KandinskyV22InpaintPipeline, + KandinskyV22Pipeline, + KandinskyV22PriorEmb2EmbPipeline, + KandinskyV22PriorPipeline, + ) + from .latent_diffusion import LDMTextToImagePipeline + from .musicldm import MusicLDMPipeline + from .paint_by_example import PaintByExamplePipeline + from .semantic_stable_diffusion import SemanticStableDiffusionPipeline + from .shap_e import ShapEImg2ImgPipeline, ShapEPipeline + from .stable_diffusion import ( + CycleDiffusionPipeline, + StableDiffusionAttendAndExcitePipeline, + StableDiffusionDepth2ImgPipeline, + StableDiffusionDiffEditPipeline, + StableDiffusionGLIGENPipeline, + StableDiffusionGLIGENTextImagePipeline, + StableDiffusionImageVariationPipeline, + StableDiffusionImg2ImgPipeline, + StableDiffusionInpaintPipeline, + StableDiffusionInpaintPipelineLegacy, + StableDiffusionInstructPix2PixPipeline, + StableDiffusionLatentUpscalePipeline, + StableDiffusionLDM3DPipeline, + StableDiffusionModelEditingPipeline, + StableDiffusionPanoramaPipeline, + StableDiffusionParadigmsPipeline, + StableDiffusionPipeline, + StableDiffusionPix2PixZeroPipeline, + StableDiffusionSAGPipeline, + StableDiffusionUpscalePipeline, + StableUnCLIPImg2ImgPipeline, + StableUnCLIPPipeline, + ) + from .stable_diffusion_safe import StableDiffusionPipelineSafe + from .stable_diffusion_xl import ( + StableDiffusionXLImg2ImgPipeline, + StableDiffusionXLInpaintPipeline, + StableDiffusionXLInstructPix2PixPipeline, + StableDiffusionXLPipeline, + ) + from .t2i_adapter import StableDiffusionAdapterPipeline, StableDiffusionXLAdapterPipeline + 
from .text_to_video_synthesis import ( + TextToVideoSDPipeline, + TextToVideoZeroPipeline, + VideoToVideoSDPipeline, + ) + from .unclip import UnCLIPImageVariationPipeline, UnCLIPPipeline + from .unidiffuser import ( + ImageTextPipelineOutput, + UniDiffuserModel, + UniDiffuserPipeline, + UniDiffuserTextDecoder, + ) + from .versatile_diffusion import ( + VersatileDiffusionDualGuidedPipeline, + VersatileDiffusionImageVariationPipeline, + VersatileDiffusionPipeline, + VersatileDiffusionTextToImagePipeline, + ) + from .vq_diffusion import VQDiffusionPipeline + from .wuerstchen import ( + WuerstchenCombinedPipeline, + WuerstchenDecoderPipeline, + WuerstchenPriorPipeline, + ) + + try: + if not is_onnx_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ..utils.dummy_onnx_objects import * # noqa F403 + + else: + from .onnx_utils import OnnxRuntimeModel + + try: + if not (is_torch_available() and is_transformers_available() and is_onnx_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ..utils.dummy_torch_and_transformers_and_onnx_objects import * + else: + from .stable_diffusion import ( + OnnxStableDiffusionImg2ImgPipeline, + OnnxStableDiffusionInpaintPipeline, + OnnxStableDiffusionInpaintPipelineLegacy, + OnnxStableDiffusionPipeline, + OnnxStableDiffusionUpscalePipeline, + StableDiffusionOnnxPipeline, + ) + + try: + if not (is_torch_available() and is_transformers_available() and is_k_diffusion_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ..utils.dummy_torch_and_transformers_and_k_diffusion_objects import * + else: + from .stable_diffusion import StableDiffusionKDiffusionPipeline + + try: + if not is_flax_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ..utils.dummy_flax_objects import * # noqa F403 + else: + from .pipeline_flax_utils import FlaxDiffusionPipeline + + try: + if not (is_flax_available() and is_transformers_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ..utils.dummy_flax_and_transformers_objects import * + else: + from .controlnet import FlaxStableDiffusionControlNetPipeline + from .stable_diffusion import ( + FlaxStableDiffusionImg2ImgPipeline, + FlaxStableDiffusionInpaintPipeline, + FlaxStableDiffusionPipeline, + ) + + try: + if not (is_transformers_available() and is_torch_available() and is_note_seq_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ..utils.dummy_transformers_and_torch_and_note_seq_objects import * # noqa F403 + + else: + from .spectrogram_diffusion import MidiProcessor, SpectrogramDiffusionPipeline -import sys - +else: + import sys -sys.modules[__name__] = _LazyModule( - __name__, - globals()["__file__"], - _import_structure, - module_spec=__spec__, -) -for name, value in _dummy_objects.items(): - setattr(sys.modules[__name__], name, value) + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/src/diffusers/pipelines/alt_diffusion/__init__.py b/src/diffusers/pipelines/alt_diffusion/__init__.py index c2e4db7eab1c..c0ad3b4a3486 100644 --- a/src/diffusers/pipelines/alt_diffusion/__init__.py +++ b/src/diffusers/pipelines/alt_diffusion/__init__.py @@ -1,3 +1,5 @@ +from typing 
import TYPE_CHECKING + from ...utils import ( OptionalDependencyNotAvailable, _LazyModule, @@ -7,8 +9,8 @@ ) -_import_structure = {} _dummy_objects = {} +_import_structure = {} try: if not (is_transformers_available() and is_torch_available()): @@ -17,21 +19,34 @@ from ...utils import dummy_torch_and_transformers_objects _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) - else: - _import_structure["pipeline_output"] = ["AltDiffusionPipelineOutput"] _import_structure["modeling_roberta_series"] = ["RobertaSeriesModelWithTransformation"] _import_structure["pipeline_alt_diffusion"] = ["AltDiffusionPipeline"] _import_structure["pipeline_alt_diffusion_img2img"] = ["AltDiffusionImg2ImgPipeline"] -import sys + _import_structure["pipeline_output"] = ["AltDiffusionPipelineOutput"] +if TYPE_CHECKING: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * -sys.modules[__name__] = _LazyModule( - __name__, - globals()["__file__"], - _import_structure, - module_spec=__spec__, -) -for name, value in _dummy_objects.items(): - setattr(sys.modules[__name__], name, value) + else: + from .modeling_roberta_series import RobertaSeriesModelWithTransformation + from .pipeline_alt_diffusion import AltDiffusionPipeline + from .pipeline_alt_diffusion_img2img import AltDiffusionImg2ImgPipeline + from .pipeline_output import AltDiffusionPipelineOutput + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/src/diffusers/pipelines/audio_diffusion/__init__.py b/src/diffusers/pipelines/audio_diffusion/__init__.py index 578a94693382..7880d02a79a3 100644 --- a/src/diffusers/pipelines/audio_diffusion/__init__.py +++ b/src/diffusers/pipelines/audio_diffusion/__init__.py @@ -1,18 +1,23 @@ -from ...utils import _LazyModule +from typing import TYPE_CHECKING +from ...utils import _LazyModule -_import_structure = {} -_dummy_objects = {} -_import_structure["mel"] = ["Mel"] -_import_structure["pipeline_audio_diffusion"] = ["AudioDiffusionPipeline"] +_import_structure = { + "mel": ["Mel"], + "pipeline_audio_diffusion": ["AudioDiffusionPipeline"], +} -import sys +if TYPE_CHECKING: + from .mel import Mel + from .pipeline_audio_diffusion import AudioDiffusionPipeline +else: + import sys -sys.modules[__name__] = _LazyModule( - __name__, - globals()["__file__"], - _import_structure, - module_spec=__spec__, -) + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) diff --git a/src/diffusers/pipelines/audioldm/__init__.py b/src/diffusers/pipelines/audioldm/__init__.py index 2acd5c25ed75..57c0fe46c324 100644 --- a/src/diffusers/pipelines/audioldm/__init__.py +++ b/src/diffusers/pipelines/audioldm/__init__.py @@ -1,3 +1,5 @@ +from typing import TYPE_CHECKING + from ...utils import ( OptionalDependencyNotAvailable, _LazyModule, @@ -7,9 +9,8 @@ ) -_import_structure = {} _dummy_objects = {} - +_import_structure = {} try: if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")): @@ -20,19 +21,30 @@ ) _dummy_objects.update({"AudioLDMPipeline": AudioLDMPipeline}) - else: _import_structure["pipeline_audioldm"] = ["AudioLDMPipeline"] -import sys 
+if TYPE_CHECKING: + try: + if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import ( + AudioLDMPipeline, + ) -sys.modules[__name__] = _LazyModule( - __name__, - globals()["__file__"], - _import_structure, - module_spec=__spec__, -) + else: + from .pipeline_audioldm import AudioLDMPipeline +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) -for name, value in _dummy_objects.items(): - setattr(sys.modules[__name__], name, value) + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/src/diffusers/pipelines/audioldm2/__init__.py b/src/diffusers/pipelines/audioldm2/__init__.py index 67001f8e44ca..50330c677452 100644 --- a/src/diffusers/pipelines/audioldm2/__init__.py +++ b/src/diffusers/pipelines/audioldm2/__init__.py @@ -1,3 +1,5 @@ +from typing import TYPE_CHECKING + from ...utils import ( OptionalDependencyNotAvailable, _LazyModule, @@ -8,9 +10,8 @@ ) -_import_structure = {} _dummy_objects = {} - +_import_structure = {} try: if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")): @@ -19,11 +20,23 @@ from ...utils import dummy_torch_and_transformers_objects _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) - else: _import_structure["modeling_audioldm2"] = ["AudioLDM2ProjectionModel", "AudioLDM2UNet2DConditionModel"] _import_structure["pipeline_audioldm2"] = ["AudioLDM2Pipeline"] + +if TYPE_CHECKING: + try: + if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * + + else: + from .modeling_audioldm2 import AudioLDM2ProjectionModel, AudioLDM2UNet2DConditionModel + from .pipeline_audioldm2 import AudioLDM2Pipeline + +else: import sys sys.modules[__name__] = _LazyModule( @@ -32,3 +45,5 @@ _import_structure, module_spec=__spec__, ) + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/src/diffusers/pipelines/consistency_models/__init__.py b/src/diffusers/pipelines/consistency_models/__init__.py index d1d2ab59500b..83fd1341d82a 100644 --- a/src/diffusers/pipelines/consistency_models/__init__.py +++ b/src/diffusers/pipelines/consistency_models/__init__.py @@ -1,17 +1,21 @@ +from typing import TYPE_CHECKING + from ...utils import ( _LazyModule, ) -_import_structure = {} -_import_structure["pipeline_consistency_models"] = ["ConsistencyModelPipeline"] +_import_structure = {"pipeline_consistency_models": ["ConsistencyModelPipeline"]} -import sys +if TYPE_CHECKING: + from .pipeline_consistency_models import ConsistencyModelPipeline +else: + import sys -sys.modules[__name__] = _LazyModule( - __name__, - globals()["__file__"], - _import_structure, - module_spec=__spec__, -) + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) diff --git a/src/diffusers/pipelines/controlnet/__init__.py b/src/diffusers/pipelines/controlnet/__init__.py index 60b3fa0b7539..5c551533f3a8 100644 --- a/src/diffusers/pipelines/controlnet/__init__.py +++ b/src/diffusers/pipelines/controlnet/__init__.py 
diff --git a/src/diffusers/pipelines/controlnet/__init__.py b/src/diffusers/pipelines/controlnet/__init__.py
index 60b3fa0b7539..5c551533f3a8 100644
--- a/src/diffusers/pipelines/controlnet/__init__.py
+++ b/src/diffusers/pipelines/controlnet/__init__.py
@@ -1,3 +1,5 @@
+from typing import TYPE_CHECKING
+
 from ...utils import (
     OptionalDependencyNotAvailable,
     _LazyModule,
@@ -8,50 +10,68 @@
 )
 
-_import_structure = {}
 _dummy_objects = {}
-
+_import_structure = {}
 
 try:
     if not (is_transformers_available() and is_torch_available()):
         raise OptionalDependencyNotAvailable()
-
 except OptionalDependencyNotAvailable:
     from ...utils import dummy_torch_and_transformers_objects  # noqa F403
 
     _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
-
-
 else:
     _import_structure["multicontrolnet"] = ["MultiControlNetModel"]
     _import_structure["pipeline_controlnet"] = ["StableDiffusionControlNetPipeline"]
     _import_structure["pipeline_controlnet_img2img"] = ["StableDiffusionControlNetImg2ImgPipeline"]
     _import_structure["pipeline_controlnet_inpaint"] = ["StableDiffusionControlNetInpaintPipeline"]
+    _import_structure["pipeline_controlnet_inpaint_sd_xl"] = ["StableDiffusionXLControlNetInpaintPipeline"]
     _import_structure["pipeline_controlnet_sd_xl"] = ["StableDiffusionXLControlNetPipeline"]
     _import_structure["pipeline_controlnet_sd_xl_img2img"] = ["StableDiffusionXLControlNetImg2ImgPipeline"]
-    _import_structure["pipeline_controlnet_inpaint_sd_xl"] = ["StableDiffusionXLControlNetInpaintPipeline"]
-
 try:
     if not (is_transformers_available() and is_flax_available()):
         raise OptionalDependencyNotAvailable()
-
 except OptionalDependencyNotAvailable:
     from ...utils import dummy_flax_and_transformers_objects  # noqa F403
 
     _dummy_objects.update(get_objects_from_module(dummy_flax_and_transformers_objects))
-
 else:
     _import_structure["pipeline_flax_controlnet"] = ["FlaxStableDiffusionControlNetPipeline"]
 
-import sys
+if TYPE_CHECKING:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
 
-sys.modules[__name__] = _LazyModule(
-    __name__,
-    globals()["__file__"],
-    _import_structure,
-    module_spec=__spec__,
-)
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .multicontrolnet import MultiControlNetModel
+        from .pipeline_controlnet import StableDiffusionControlNetPipeline
+        from .pipeline_controlnet_img2img import StableDiffusionControlNetImg2ImgPipeline
+        from .pipeline_controlnet_inpaint import StableDiffusionControlNetInpaintPipeline
+        from .pipeline_controlnet_inpaint_sd_xl import StableDiffusionXLControlNetInpaintPipeline
+        from .pipeline_controlnet_sd_xl import StableDiffusionXLControlNetPipeline
+        from .pipeline_controlnet_sd_xl_img2img import StableDiffusionXLControlNetImg2ImgPipeline
+
+    try:
+        if not (is_transformers_available() and is_flax_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_flax_and_transformers_objects import *  # noqa F403
+    else:
+        from .pipeline_flax_controlnet import FlaxStableDiffusionControlNetPipeline
 
-for name, value in _dummy_objects.items():
-    setattr(sys.modules[__name__], name, value)
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/src/diffusers/pipelines/dance_diffusion/__init__.py b/src/diffusers/pipelines/dance_diffusion/__init__.py
index 39f213b35a04..c777d437060c 100644
--- a/src/diffusers/pipelines/dance_diffusion/__init__.py
+++ b/src/diffusers/pipelines/dance_diffusion/__init__.py
@@ -1,16 +1,18 @@
-from ...utils import _LazyModule
-
+from typing import TYPE_CHECKING
 
-_import_structure = {}
-_import_structure["pipeline_dance_diffusion"] = ["DanceDiffusionPipeline"]
+from ...utils import _LazyModule
 
-import sys
+_import_structure = {"pipeline_dance_diffusion": ["DanceDiffusionPipeline"]}
 
+if TYPE_CHECKING:
+    from .pipeline_dance_diffusion import DanceDiffusionPipeline
+else:
+    import sys
 
-sys.modules[__name__] = _LazyModule(
-    __name__,
-    globals()["__file__"],
-    _import_structure,
-    module_spec=__spec__,
-)
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
diff --git a/src/diffusers/pipelines/ddim/__init__.py b/src/diffusers/pipelines/ddim/__init__.py
index 1715a2b6acbb..0121cd8f6dac 100644
--- a/src/diffusers/pipelines/ddim/__init__.py
+++ b/src/diffusers/pipelines/ddim/__init__.py
@@ -1,15 +1,18 @@
-from ...utils import _LazyModule
+from typing import TYPE_CHECKING
 
+from ...utils import _LazyModule
 
-_import_structure = {}
-_import_structure["pipeline_ddim"] = ["DDIMPipeline"]
-
-import sys
+_import_structure = {"pipeline_ddim": ["DDIMPipeline"]}
 
+if TYPE_CHECKING:
+    from .pipeline_ddim import DDIMPipeline
+else:
+    import sys
 
-sys.modules[__name__] = _LazyModule(
-    __name__,
-    globals()["__file__"],
-    _import_structure,
-    module_spec=__spec__,
-)
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
diff --git a/src/diffusers/pipelines/ddpm/__init__.py b/src/diffusers/pipelines/ddpm/__init__.py
index a3936af03a6a..f9320e0bc567 100644
--- a/src/diffusers/pipelines/ddpm/__init__.py
+++ b/src/diffusers/pipelines/ddpm/__init__.py
@@ -1,17 +1,21 @@
+from typing import TYPE_CHECKING
+
 from ...utils import (
     _LazyModule,
 )
 
-_import_structure = {}
-_import_structure["pipeline_ddpm"] = ["DDPMPipeline"]
+_import_structure = {"pipeline_ddpm": ["DDPMPipeline"]}
 
-import sys
+if TYPE_CHECKING:
+    from .pipeline_ddpm import DDPMPipeline
+else:
+    import sys
 
-sys.modules[__name__] = _LazyModule(
-    __name__,
-    globals()["__file__"],
-    _import_structure,
-    module_spec=__spec__,
-)
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
diff --git a/src/diffusers/pipelines/deepfloyd_if/__init__.py b/src/diffusers/pipelines/deepfloyd_if/__init__.py
index a6d58cab9c81..bb0acffc6fa7 100644
--- a/src/diffusers/pipelines/deepfloyd_if/__init__.py
+++ b/src/diffusers/pipelines/deepfloyd_if/__init__.py
@@ -1,3 +1,5 @@
+from typing import TYPE_CHECKING
+
 from ...utils import (
     OptionalDependencyNotAvailable,
     _LazyModule,
@@ -7,19 +9,19 @@
 )
 
-_import_structure = {}
 _dummy_objects = {}
-
-_import_structure["timesteps"] = [
-    "fast27_timesteps",
-    "smart27_timesteps",
-    "smart50_timesteps",
-    "smart100_timesteps",
-    "smart185_timesteps",
-    "super27_timesteps",
-    "super40_timesteps",
-    "super100_timesteps",
-]
+_import_structure = {
+    "timesteps": [
+        "fast27_timesteps",
+        "smart100_timesteps",
+        "smart185_timesteps",
+        "smart27_timesteps",
+        "smart50_timesteps",
+        "super100_timesteps",
+        "super27_timesteps",
+        "super40_timesteps",
+    ]
+}
 
 try:
     if not (is_transformers_available() and is_torch_available()):
@@ -28,28 +30,55 @@
     from ...utils import dummy_torch_and_transformers_objects  # noqa F403
 
     _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
-
 else:
-    _import_structure["pipeline_output"] = ["IFPipelineOutput"]
     _import_structure["pipeline_if"] = ["IFPipeline"]
     _import_structure["pipeline_if_img2img"] = ["IFImg2ImgPipeline"]
     _import_structure["pipeline_if_img2img_superresolution"] = ["IFImg2ImgSuperResolutionPipeline"]
     _import_structure["pipeline_if_inpainting"] = ["IFInpaintingPipeline"]
     _import_structure["pipeline_if_inpainting_superresolution"] = ["IFInpaintingSuperResolutionPipeline"]
     _import_structure["pipeline_if_superresolution"] = ["IFSuperResolutionPipeline"]
+    _import_structure["pipeline_output"] = ["IFPipelineOutput"]
     _import_structure["safety_checker"] = ["IFSafetyChecker"]
     _import_structure["watermark"] = ["IFWatermarker"]
 
-import sys
+if TYPE_CHECKING:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_if import IFPipeline
+        from .pipeline_if_img2img import IFImg2ImgPipeline
+        from .pipeline_if_img2img_superresolution import IFImg2ImgSuperResolutionPipeline
+        from .pipeline_if_inpainting import IFInpaintingPipeline
+        from .pipeline_if_inpainting_superresolution import IFInpaintingSuperResolutionPipeline
+        from .pipeline_if_superresolution import IFSuperResolutionPipeline
+        from .pipeline_output import IFPipelineOutput
+        from .safety_checker import IFSafetyChecker
+        from .timesteps import (
+            fast27_timesteps,
+            smart27_timesteps,
+            smart50_timesteps,
+            smart100_timesteps,
+            smart185_timesteps,
+            super27_timesteps,
+            super40_timesteps,
+            super100_timesteps,
+        )
+        from .watermark import IFWatermarker
 
-sys.modules[__name__] = _LazyModule(
-    __name__,
-    globals()["__file__"],
-    _import_structure,
-    module_spec=__spec__,
-)
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
 
-for name, value in _dummy_objects.items():
-    setattr(sys.modules[__name__], name, value)
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
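When torch or transformers is missing, the branches above collect placeholder classes from `dummy_torch_and_transformers_objects` via `get_objects_from_module` and `setattr` them onto the lazy module, so `from diffusers import IFPipeline` still succeeds and only fails when the object is actually used. A hedged sketch of how such a placeholder can be built (modeled on the `dummy_*_objects` modules; the exact upstream implementation differs in details such as its `requires_backends` helper):

    class DummyObject(type):
        """Metaclass for placeholders standing in for objects behind missing backends."""

        def __getattr__(cls, name):
            raise ImportError(f"{cls.__name__} requires {cls._backends} to be installed.")


    class IFPipeline(metaclass=DummyObject):
        _backends = ["torch", "transformers"]

        def __init__(self, *args, **kwargs):
            # Fail at use time with an actionable message instead of failing
            # the top-level `import diffusers`.
            raise ImportError(f"IFPipeline requires {self._backends} to be installed.")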
_import_structure["pipeline_if_img2img"] = ["IFImg2ImgPipeline"] _import_structure["pipeline_if_img2img_superresolution"] = ["IFImg2ImgSuperResolutionPipeline"] _import_structure["pipeline_if_inpainting"] = ["IFInpaintingPipeline"] _import_structure["pipeline_if_inpainting_superresolution"] = ["IFInpaintingSuperResolutionPipeline"] _import_structure["pipeline_if_superresolution"] = ["IFSuperResolutionPipeline"] + _import_structure["pipeline_output"] = ["IFPipelineOutput"] _import_structure["safety_checker"] = ["IFSafetyChecker"] _import_structure["watermark"] = ["IFWatermarker"] -import sys +if TYPE_CHECKING: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * + else: + from .pipeline_if import IFPipeline + from .pipeline_if_img2img import IFImg2ImgPipeline + from .pipeline_if_img2img_superresolution import IFImg2ImgSuperResolutionPipeline + from .pipeline_if_inpainting import IFInpaintingPipeline + from .pipeline_if_inpainting_superresolution import IFInpaintingSuperResolutionPipeline + from .pipeline_if_superresolution import IFSuperResolutionPipeline + from .pipeline_output import IFPipelineOutput + from .safety_checker import IFSafetyChecker + from .timesteps import ( + fast27_timesteps, + smart27_timesteps, + smart50_timesteps, + smart100_timesteps, + smart185_timesteps, + super27_timesteps, + super40_timesteps, + super100_timesteps, + ) + from .watermark import IFWatermarker -sys.modules[__name__] = _LazyModule( - __name__, - globals()["__file__"], - _import_structure, - module_spec=__spec__, -) +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) -for name, value in _dummy_objects.items(): - setattr(sys.modules[__name__], name, value) + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/src/diffusers/pipelines/dit/__init__.py b/src/diffusers/pipelines/dit/__init__.py index be3c74454393..a260779cafae 100644 --- a/src/diffusers/pipelines/dit/__init__.py +++ b/src/diffusers/pipelines/dit/__init__.py @@ -1,15 +1,19 @@ +from typing import TYPE_CHECKING + from ...utils import _LazyModule -_import_structure = {} -_import_structure["pipeline_dit"] = ["DiTPipeline"] +_import_structure = {"pipeline_dit": ["DiTPipeline"]} -import sys +if TYPE_CHECKING: + from .pipeline_dit import DiTPipeline +else: + import sys -sys.modules[__name__] = _LazyModule( - __name__, - globals()["__file__"], - _import_structure, - module_spec=__spec__, -) + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) diff --git a/src/diffusers/pipelines/kandinsky/__init__.py b/src/diffusers/pipelines/kandinsky/__init__.py index cc4580721eff..63b34e16c95a 100644 --- a/src/diffusers/pipelines/kandinsky/__init__.py +++ b/src/diffusers/pipelines/kandinsky/__init__.py @@ -1,3 +1,5 @@ +from typing import TYPE_CHECKING + from ...utils import ( OptionalDependencyNotAvailable, _LazyModule, @@ -7,9 +9,8 @@ ) -_import_structure = {} _dummy_objects = {} - +_import_structure = {} try: if not (is_transformers_available() and is_torch_available()): @@ -18,7 +19,6 @@ from ...utils import dummy_torch_and_transformers_objects # noqa F403 _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) - else: _import_structure["pipeline_kandinsky"] = 
["KandinskyPipeline"] _import_structure["pipeline_kandinsky_combined"] = [ @@ -32,15 +32,34 @@ _import_structure["text_encoder"] = ["MultilingualCLIP"] -import sys +if TYPE_CHECKING: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * + else: + from .pipeline_kandinsky import KandinskyPipeline + from .pipeline_kandinsky_combined import ( + KandinskyCombinedPipeline, + KandinskyImg2ImgCombinedPipeline, + KandinskyInpaintCombinedPipeline, + ) + from .pipeline_kandinsky_img2img import KandinskyImg2ImgPipeline + from .pipeline_kandinsky_inpaint import KandinskyInpaintPipeline + from .pipeline_kandinsky_prior import KandinskyPriorPipeline, KandinskyPriorPipelineOutput + from .text_encoder import MultilingualCLIP -sys.modules[__name__] = _LazyModule( - __name__, - globals()["__file__"], - _import_structure, - module_spec=__spec__, -) +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) -for name, value in _dummy_objects.items(): - setattr(sys.modules[__name__], name, value) + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/src/diffusers/pipelines/kandinsky2_2/__init__.py b/src/diffusers/pipelines/kandinsky2_2/__init__.py index 639d6ad977c2..461e3d25ca73 100644 --- a/src/diffusers/pipelines/kandinsky2_2/__init__.py +++ b/src/diffusers/pipelines/kandinsky2_2/__init__.py @@ -1,3 +1,5 @@ +from typing import TYPE_CHECKING + from ...utils import ( OptionalDependencyNotAvailable, _LazyModule, @@ -7,9 +9,8 @@ ) -_import_structure = {} _dummy_objects = {} - +_import_structure = {} try: if not (is_transformers_available() and is_torch_available()): @@ -18,7 +19,6 @@ from ...utils import dummy_torch_and_transformers_objects # noqa F403 _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) - else: _import_structure["pipeline_kandinsky2_2"] = ["KandinskyV22Pipeline"] _import_structure["pipeline_kandinsky2_2_combined"] = [ @@ -34,15 +34,36 @@ _import_structure["pipeline_kandinsky2_2_prior_emb2emb"] = ["KandinskyV22PriorEmb2EmbPipeline"] -import sys +if TYPE_CHECKING: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * + else: + from .pipeline_kandinsky2_2 import KandinskyV22Pipeline + from .pipeline_kandinsky2_2_combined import ( + KandinskyV22CombinedPipeline, + KandinskyV22Img2ImgCombinedPipeline, + KandinskyV22InpaintCombinedPipeline, + ) + from .pipeline_kandinsky2_2_controlnet import KandinskyV22ControlnetPipeline + from .pipeline_kandinsky2_2_controlnet_img2img import KandinskyV22ControlnetImg2ImgPipeline + from .pipeline_kandinsky2_2_img2img import KandinskyV22Img2ImgPipeline + from .pipeline_kandinsky2_2_inpainting import KandinskyV22InpaintPipeline + from .pipeline_kandinsky2_2_prior import KandinskyV22PriorPipeline + from .pipeline_kandinsky2_2_prior_emb2emb import KandinskyV22PriorEmb2EmbPipeline -sys.modules[__name__] = _LazyModule( - __name__, - globals()["__file__"], - _import_structure, - module_spec=__spec__, -) +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) -for name, value in 
_dummy_objects.items(): - setattr(sys.modules[__name__], name, value) + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/src/diffusers/pipelines/latent_diffusion/__init__.py b/src/diffusers/pipelines/latent_diffusion/__init__.py index a78e6622bcfe..bc6ac82217a3 100644 --- a/src/diffusers/pipelines/latent_diffusion/__init__.py +++ b/src/diffusers/pipelines/latent_diffusion/__init__.py @@ -1,3 +1,5 @@ +from typing import TYPE_CHECKING + from ...utils import ( OptionalDependencyNotAvailable, _LazyModule, @@ -7,9 +9,8 @@ ) -_import_structure = {} _dummy_objects = {} - +_import_structure = {} try: if not (is_transformers_available() and is_torch_available()): @@ -23,15 +24,26 @@ _import_structure["pipeline_latent_diffusion_superresolution"] = ["LDMSuperResolutionPipeline"] -import sys +if TYPE_CHECKING: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * + else: + from .pipeline_latent_diffusion import LDMBertModel, LDMTextToImagePipeline + from .pipeline_latent_diffusion_superresolution import LDMSuperResolutionPipeline -sys.modules[__name__] = _LazyModule( - __name__, - globals()["__file__"], - _import_structure, - module_spec=__spec__, -) +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) -for name, value in _dummy_objects.items(): - setattr(sys.modules[__name__], name, value) + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/src/diffusers/pipelines/latent_diffusion_uncond/__init__.py b/src/diffusers/pipelines/latent_diffusion_uncond/__init__.py index 73e5c703f61a..2dd64d353513 100644 --- a/src/diffusers/pipelines/latent_diffusion_uncond/__init__.py +++ b/src/diffusers/pipelines/latent_diffusion_uncond/__init__.py @@ -1,15 +1,18 @@ -from ...utils import _LazyModule +from typing import TYPE_CHECKING +from ...utils import _LazyModule -_import_structure = {} -_import_structure["pipeline_latent_diffusion_uncond"] = ["LDMPipeline"] -import sys +_import_structure = {"pipeline_latent_diffusion_uncond": ["LDMPipeline"]} +if TYPE_CHECKING: + from .pipeline_latent_diffusion_uncond import LDMPipeline +else: + import sys -sys.modules[__name__] = _LazyModule( - __name__, - globals()["__file__"], - _import_structure, - module_spec=__spec__, -) + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) diff --git a/src/diffusers/pipelines/musicldm/__init__.py b/src/diffusers/pipelines/musicldm/__init__.py index 6228f763a53b..e49eb1f16d7d 100644 --- a/src/diffusers/pipelines/musicldm/__init__.py +++ b/src/diffusers/pipelines/musicldm/__init__.py @@ -1,3 +1,5 @@ +from typing import TYPE_CHECKING + from ...utils import ( OptionalDependencyNotAvailable, _LazyModule, @@ -8,8 +10,8 @@ ) -_import_structure = {} _dummy_objects = {} +_import_structure = {} try: if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")): @@ -18,19 +20,29 @@ from ...utils import dummy_torch_and_transformers_objects # noqa F403 _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) - else: _import_structure["pipeline_musicldm"] = ["MusicLDMPipeline"] -import sys +if TYPE_CHECKING: + try: + if not (is_transformers_available() and 
diff --git a/src/diffusers/pipelines/paint_by_example/__init__.py b/src/diffusers/pipelines/paint_by_example/__init__.py
index c19ce1036e3f..bfe4810e5ab5 100644
--- a/src/diffusers/pipelines/paint_by_example/__init__.py
+++ b/src/diffusers/pipelines/paint_by_example/__init__.py
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import List, Optional, Union
+from typing import TYPE_CHECKING, List, Optional, Union
 
 import numpy as np
 import PIL
@@ -14,8 +14,8 @@
 )
 
-_import_structure = {}
 _dummy_objects = {}
+_import_structure = {}
 
 try:
     if not (is_transformers_available() and is_torch_available()):
@@ -28,15 +28,27 @@
     _import_structure["image_encoder"] = ["PaintByExampleImageEncoder"]
     _import_structure["pipeline_paint_by_example"] = ["PaintByExamplePipeline"]
 
-import sys
+if TYPE_CHECKING:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
 
-sys.modules[__name__] = _LazyModule(
-    __name__,
-    globals()["__file__"],
-    _import_structure,
-    module_spec=__spec__,
-)
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .image_encoder import PaintByExampleImageEncoder
+        from .pipeline_paint_by_example import PaintByExamplePipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
 
-for name, value in _dummy_objects.items():
-    setattr(sys.modules[__name__], name, value)
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/src/diffusers/pipelines/pndm/__init__.py b/src/diffusers/pipelines/pndm/__init__.py
index 7374016c32d9..4764b1b7594a 100644
--- a/src/diffusers/pipelines/pndm/__init__.py
+++ b/src/diffusers/pipelines/pndm/__init__.py
@@ -1,16 +1,18 @@
-from ...utils import _LazyModule
-
+from typing import TYPE_CHECKING
 
-_import_structure = {}
-_import_structure["pipeline_pndm"] = ["PNDMPipeline"]
+from ...utils import _LazyModule
 
-import sys
+_import_structure = {"pipeline_pndm": ["PNDMPipeline"]}
 
+if TYPE_CHECKING:
+    from .pipeline_pndm import PNDMPipeline
+else:
+    import sys
 
-sys.modules[__name__] = _LazyModule(
-    __name__,
-    globals()["__file__"],
-    _import_structure,
-    module_spec=__spec__,
-)
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
diff --git a/src/diffusers/pipelines/repaint/__init__.py b/src/diffusers/pipelines/repaint/__init__.py
index 2a0eedf30bbf..ffed0c2ab05c 100644
--- a/src/diffusers/pipelines/repaint/__init__.py
+++ b/src/diffusers/pipelines/repaint/__init__.py
@@ -1,15 +1,19 @@
+from typing import TYPE_CHECKING
+
 from ...utils import _LazyModule
 
 
-_import_structure = {}
-_import_structure["pipeline_repaint"] = ["RePaintPipeline"]
+_import_structure = {"pipeline_repaint": ["RePaintPipeline"]}
 
-import sys
+if TYPE_CHECKING:
+    from .pipeline_repaint import RePaintPipeline
+else:
+    import sys
 
-sys.modules[__name__] = _LazyModule(
-    __name__,
-    globals()["__file__"],
-    _import_structure,
-    module_spec=__spec__,
-)
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
diff --git a/src/diffusers/pipelines/score_sde_ve/__init__.py b/src/diffusers/pipelines/score_sde_ve/__init__.py
index 2cd7ac2bf440..192467afd11a 100644
--- a/src/diffusers/pipelines/score_sde_ve/__init__.py
+++ b/src/diffusers/pipelines/score_sde_ve/__init__.py
@@ -1,15 +1,19 @@
+from typing import TYPE_CHECKING
+
 from ...utils import _LazyModule
 
 
-_import_structure = {}
-_import_structure["pipeline_score_sde_ve"] = ["ScoreSdeVePipeline"]
+_import_structure = {"pipeline_score_sde_ve": ["ScoreSdeVePipeline"]}
 
-import sys
+if TYPE_CHECKING:
+    from .pipeline_score_sde_ve import ScoreSdeVePipeline
+else:
+    import sys
 
-sys.modules[__name__] = _LazyModule(
-    __name__,
-    globals()["__file__"],
-    _import_structure,
-    module_spec=__spec__,
-)
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
diff --git a/src/diffusers/pipelines/semantic_stable_diffusion/__init__.py b/src/diffusers/pipelines/semantic_stable_diffusion/__init__.py
index 1b743ac3d58d..96842bc84225 100644
--- a/src/diffusers/pipelines/semantic_stable_diffusion/__init__.py
+++ b/src/diffusers/pipelines/semantic_stable_diffusion/__init__.py
@@ -1,3 +1,5 @@
+from typing import TYPE_CHECKING
+
 from ...utils import (
     OptionalDependencyNotAvailable,
     _LazyModule,
@@ -7,9 +9,8 @@
 )
 
-_import_structure = {}
 _dummy_objects = {}
-
+_import_structure = {}
 
 try:
     if not (is_transformers_available() and is_torch_available()):
@@ -18,21 +19,30 @@
     from ...utils import dummy_torch_and_transformers_objects  # noqa F403
 
     _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
-
 else:
     _import_structure["pipeline_output"] = ["SemanticStableDiffusionPipelineOutput"]
     _import_structure["pipeline_semantic_stable_diffusion"] = ["SemanticStableDiffusionPipeline"]
 
-import sys
+if TYPE_CHECKING:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_semantic_stable_diffusion import SemanticStableDiffusionPipeline
 
-sys.modules[__name__] = _LazyModule(
-    __name__,
-    globals()["__file__"],
-    _import_structure,
-    module_spec=__spec__,
-)
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
 
-for name, value in _dummy_objects.items():
-    setattr(sys.modules[__name__], name, value)
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/src/diffusers/pipelines/shap_e/__init__.py b/src/diffusers/pipelines/shap_e/__init__.py
index 2a56148fee91..13a9fc1aa1cb 100644
--- a/src/diffusers/pipelines/shap_e/__init__.py
+++ b/src/diffusers/pipelines/shap_e/__init__.py
@@ -1,3 +1,5 @@
+from typing import TYPE_CHECKING
+
 from ...utils import (
     OptionalDependencyNotAvailable,
     _LazyModule,
@@ -7,9 +9,8 @@
 )
 
-_import_structure = {}
 _dummy_objects = {}
-
+_import_structure = {}
 
 try:
     if not (is_transformers_available() and is_torch_available()):
@@ -33,15 +34,37 @@
         "VoidNeRFModel",
     ]
 
-import sys
+if TYPE_CHECKING:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .camera import create_pan_cameras
+        from .pipeline_shap_e import ShapEPipeline
+        from .pipeline_shap_e_img2img import ShapEImg2ImgPipeline
+        from .renderer import (
+            BoundingBoxVolume,
+            ImportanceRaySampler,
+            MLPNeRFModelOutput,
+            MLPNeRSTFModel,
+            ShapEParamsProjModel,
+            ShapERenderer,
+            StratifiedRaySampler,
+            VoidNeRFModel,
+        )
 
-sys.modules[__name__] = _LazyModule(
-    __name__,
-    globals()["__file__"],
-    _import_structure,
-    module_spec=__spec__,
-)
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
 
-for name, value in _dummy_objects.items():
-    setattr(sys.modules[__name__], name, value)
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/src/diffusers/pipelines/spectrogram_diffusion/__init__.py b/src/diffusers/pipelines/spectrogram_diffusion/__init__.py
index e8bcf63c2986..e6e8393aa8af 100644
--- a/src/diffusers/pipelines/spectrogram_diffusion/__init__.py
+++ b/src/diffusers/pipelines/spectrogram_diffusion/__init__.py
@@ -1,4 +1,5 @@
 # flake8: noqa
+from typing import TYPE_CHECKING
 from ...utils import (
     _LazyModule,
     is_note_seq_available,
@@ -8,9 +9,8 @@
     get_objects_from_module,
 )
 
-_import_structure = {}
 _dummy_objects = {}
-
+_import_structure = {}
 
 try:
     if not (is_transformers_available() and is_torch_available()):
@@ -19,32 +19,56 @@
     from ...utils import dummy_torch_and_transformers_objects  # noqa F403
 
     _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
-
 else:
-    _import_structure["notes_encoder"] = ["SpectrogramNotesEncoder"]
     _import_structure["continous_encoder"] = ["SpectrogramContEncoder"]
+    _import_structure["notes_encoder"] = ["SpectrogramNotesEncoder"]
     _import_structure["pipeline_spectrogram_diffusion"] = [
         "SpectrogramContEncoder",
         "SpectrogramDiffusionPipeline",
         "T5FilmDecoder",
     ]
-
 try:
     if not (is_transformers_available() and is_torch_available() and is_note_seq_available()):
         raise OptionalDependencyNotAvailable()
 except OptionalDependencyNotAvailable:
-    from ...utils.dummy_transformers_and_torch_and_note_seq_objects import *  # noqa F403
+    from ...utils import dummy_transformers_and_torch_and_note_seq_objects
+
+    _dummy_objects.update(get_objects_from_module(dummy_transformers_and_torch_and_note_seq_objects))
 else:
     _import_structure["midi_utils"] = ["MidiProcessor"]
 
-import sys
 
-sys.modules[__name__] = _LazyModule(
-    __name__,
-    globals()["__file__"],
-    _import_structure,
-    module_spec=__spec__,
-)
+if TYPE_CHECKING:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
 
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_spectrogram_diffusion import (
+            SpectrogramContEncoder,
+            SpectrogramDiffusionPipeline,
+            SpectrogramNotesEncoder,
+            T5FilmDecoder,
+        )
+
+    try:
+        if not (is_transformers_available() and is_torch_available() and is_note_seq_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_transformers_and_torch_and_note_seq_objects import *
+
+    else:
+        from .midi_utils import MidiProcessor
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
 
-for name, value in _dummy_objects.items():
-    setattr(sys.modules[__name__], name, value)
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/src/diffusers/pipelines/stable_diffusion/__init__.py b/src/diffusers/pipelines/stable_diffusion/__init__.py
index f6f3327c5fb6..57dff8568203 100644
--- a/src/diffusers/pipelines/stable_diffusion/__init__.py
+++ b/src/diffusers/pipelines/stable_diffusion/__init__.py
@@ -1,3 +1,5 @@
+from typing import TYPE_CHECKING
+
 from ...utils import (
     OptionalDependencyNotAvailable,
     _LazyModule,
@@ -12,15 +14,12 @@
 )
 
-_import_structure = {}
-_additional_imports = {}
 _dummy_objects = {}
-
-_import_structure["pipeline_output"] = ["StableDiffusionPipelineOutput"]
+_additional_imports = {}
+_import_structure = {"pipeline_output": ["StableDiffusionPipelineOutput"]}
 
 if is_transformers_available() and is_flax_available():
     _import_structure["pipeline_output"].extend(["FlaxStableDiffusionPipelineOutput"])
-
 try:
     if not (is_transformers_available() and is_torch_available()):
         raise OptionalDependencyNotAvailable()
@@ -28,12 +27,13 @@
     from ...utils import dummy_torch_and_transformers_objects  # noqa F403
 
     _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
-
 else:
+    _import_structure["clip_image_project_model"] = ["CLIPImageProjection"]
     _import_structure["pipeline_cycle_diffusion"] = ["CycleDiffusionPipeline"]
     _import_structure["pipeline_stable_diffusion"] = ["StableDiffusionPipeline"]
     _import_structure["pipeline_stable_diffusion_attend_and_excite"] = ["StableDiffusionAttendAndExcitePipeline"]
     _import_structure["pipeline_stable_diffusion_gligen"] = ["StableDiffusionGLIGENPipeline"]
+    _import_structure["pipeline_stable_diffusion_gligen_text_image"] = ["StableDiffusionGLIGENTextImagePipeline"]
     _import_structure["pipeline_stable_diffusion_img2img"] = ["StableDiffusionImg2ImgPipeline"]
     _import_structure["pipeline_stable_diffusion_inpaint"] = ["StableDiffusionInpaintPipeline"]
     _import_structure["pipeline_stable_diffusion_inpaint_legacy"] = ["StableDiffusionInpaintPipelineLegacy"]
@@ -49,10 +50,6 @@
     _import_structure["pipeline_stable_unclip_img2img"] = ["StableUnCLIPImg2ImgPipeline"]
     _import_structure["safety_checker"] = ["StableDiffusionSafetyChecker"]
     _import_structure["stable_unclip_image_normalizer"] = ["StableUnCLIPImageNormalizer"]
-    _import_structure["pipeline_stable_diffusion_gligen_text_image"] = ["StableDiffusionGLIGENTextImagePipeline"]
-    _import_structure["pipeline_stable_diffusion_gligen"] = ["StableDiffusionGLIGENPipeline"]
-    _import_structure["clip_image_project_model"] = ["CLIPImageProjection"]
-
 try:
     if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0")):
         raise OptionalDependencyNotAvailable()
@@ -62,8 +59,6 @@
     _dummy_objects.update({"StableDiffusionImageVariationPipeline": StableDiffusionImageVariationPipeline})
 else:
     _import_structure["pipeline_stable_diffusion_image_variation"] = ["StableDiffusionImageVariationPipeline"]
-
-
 try:
     if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.26.0")):
         raise OptionalDependencyNotAvailable()
@@ -85,8 +80,6 @@
     _import_structure["pipeline_stable_diffusion_depth2img"] = ["StableDiffusionDepth2ImgPipeline"]
     _import_structure["pipeline_stable_diffusion_diffedit"] = ["StableDiffusionDiffEditPipeline"]
     _import_structure["pipeline_stable_diffusion_pix2pix_zero"] = ["StableDiffusionPix2PixZeroPipeline"]
-
-
 try:
     if not (
         is_torch_available()
@@ -99,10 +92,8 @@
     from ...utils import dummy_torch_and_transformers_and_k_diffusion_objects  # noqa F403
 
     _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_and_k_diffusion_objects))
-
 else:
     _import_structure["pipeline_stable_diffusion_k_diffusion"] = ["StableDiffusionKDiffusionPipeline"]
-
 try:
     if not (is_transformers_available() and is_onnx_available()):
         raise OptionalDependencyNotAvailable()
@@ -110,7 +101,6 @@
     from ...utils import dummy_onnx_objects  # noqa F403
 
     _dummy_objects.update(get_objects_from_module(dummy_onnx_objects))
-
 else:
     _import_structure["pipeline_onnx_stable_diffusion"] = [
         "OnnxStableDiffusionPipeline",
@@ -125,23 +115,120 @@
     from ...schedulers.scheduling_pndm_flax import PNDMSchedulerState
 
     _additional_imports.update({"PNDMSchedulerState": PNDMSchedulerState})
-
     _import_structure["pipeline_flax_stable_diffusion"] = ["FlaxStableDiffusionPipeline"]
     _import_structure["pipeline_flax_stable_diffusion_img2img"] = ["FlaxStableDiffusionImg2ImgPipeline"]
     _import_structure["pipeline_flax_stable_diffusion_inpaint"] = ["FlaxStableDiffusionInpaintPipeline"]
     _import_structure["safety_checker_flax"] = ["FlaxStableDiffusionSafetyChecker"]
 
-import sys
+if TYPE_CHECKING:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+
+    else:
+        from .clip_image_project_model import CLIPImageProjection
+        from .pipeline_cycle_diffusion import CycleDiffusionPipeline
+        from .pipeline_stable_diffusion import (
+            StableDiffusionPipeline,
+            StableDiffusionPipelineOutput,
+            StableDiffusionSafetyChecker,
+        )
+        from .pipeline_stable_diffusion_attend_and_excite import StableDiffusionAttendAndExcitePipeline
+        from .pipeline_stable_diffusion_gligen import StableDiffusionGLIGENPipeline
+        from .pipeline_stable_diffusion_gligen_text_image import StableDiffusionGLIGENTextImagePipeline
+        from .pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipeline
+        from .pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipeline
+        from .pipeline_stable_diffusion_inpaint_legacy import StableDiffusionInpaintPipelineLegacy
+        from .pipeline_stable_diffusion_instruct_pix2pix import StableDiffusionInstructPix2PixPipeline
+        from .pipeline_stable_diffusion_latent_upscale import StableDiffusionLatentUpscalePipeline
+        from .pipeline_stable_diffusion_ldm3d import StableDiffusionLDM3DPipeline
+        from .pipeline_stable_diffusion_model_editing import StableDiffusionModelEditingPipeline
+        from .pipeline_stable_diffusion_panorama import StableDiffusionPanoramaPipeline
+        from .pipeline_stable_diffusion_paradigms import StableDiffusionParadigmsPipeline
+        from .pipeline_stable_diffusion_sag import StableDiffusionSAGPipeline
+        from .pipeline_stable_diffusion_upscale import StableDiffusionUpscalePipeline
+        from .pipeline_stable_unclip import StableUnCLIPPipeline
+        from .pipeline_stable_unclip_img2img import StableUnCLIPImg2ImgPipeline
+        from .safety_checker import StableDiffusionSafetyChecker
+        from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
+
+    try:
+        if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0")):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import StableDiffusionImageVariationPipeline
+    else:
+        from .pipeline_stable_diffusion_image_variation import StableDiffusionImageVariationPipeline
+
+    try:
+        if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.26.0")):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import (
+            StableDiffusionDepth2ImgPipeline,
+            StableDiffusionDiffEditPipeline,
+            StableDiffusionPix2PixZeroPipeline,
+        )
+    else:
+        from .pipeline_stable_diffusion_depth2img import StableDiffusionDepth2ImgPipeline
+        from .pipeline_stable_diffusion_diffedit import StableDiffusionDiffEditPipeline
+        from .pipeline_stable_diffusion_pix2pix_zero import StableDiffusionPix2PixZeroPipeline
+
+    try:
+        if not (
+            is_torch_available()
+            and is_transformers_available()
+            and is_k_diffusion_available()
+            and is_k_diffusion_version(">=", "0.0.12")
+        ):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_and_k_diffusion_objects import *
+    else:
+        from .pipeline_stable_diffusion_k_diffusion import StableDiffusionKDiffusionPipeline
+
+    try:
+        if not (is_transformers_available() and is_onnx_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_onnx_objects import *
+    else:
+        from .pipeline_onnx_stable_diffusion import (
+            OnnxStableDiffusionImg2ImgPipeline,
+            OnnxStableDiffusionInpaintPipeline,
+            OnnxStableDiffusionInpaintPipelineLegacy,
+            OnnxStableDiffusionPipeline,
+            OnnxStableDiffusionUpscalePipeline,
+            StableDiffusionOnnxPipeline,
+        )
+
+    try:
+        if not (is_transformers_available() and is_flax_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_flax_objects import *
+    else:
+        from .pipeline_flax_stable_diffusion import FlaxStableDiffusionPipeline
+        from .pipeline_flax_stable_diffusion_img2img import FlaxStableDiffusionImg2ImgPipeline
+        from .pipeline_flax_stable_diffusion_inpaint import FlaxStableDiffusionInpaintPipeline
+        from .pipeline_output import FlaxStableDiffusionPipelineOutput
+        from .safety_checker_flax import FlaxStableDiffusionSafetyChecker
+else:
+    import sys
 
-sys.modules[__name__] = _LazyModule(
-    __name__,
-    globals()["__file__"],
-    _import_structure,
-    module_spec=__spec__,
-)
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
 
-for name, value in _dummy_objects.items():
-    setattr(sys.modules[__name__], name, value)
-for name, value in _additional_imports.items():
-    setattr(sys.modules[__name__], name, value)
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
+    for name, value in _additional_imports.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/src/diffusers/pipelines/stable_diffusion_safe/__init__.py b/src/diffusers/pipelines/stable_diffusion_safe/__init__.py
index 8ce71c01328a..67c6ab1f6686 100644
--- a/src/diffusers/pipelines/stable_diffusion_safe/__init__.py
+++ b/src/diffusers/pipelines/stable_diffusion_safe/__init__.py
@@ -1,12 +1,19 @@
 from dataclasses import dataclass
 from enum import Enum
-from typing import List, Optional, Union
+from typing import TYPE_CHECKING, List, Optional, Union
 
 import numpy as np
 import PIL
 from PIL import Image
 
-from ...utils import BaseOutput, OptionalDependencyNotAvailable, is_torch_available, is_transformers_available
+from ...utils import (
+    BaseOutput,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
 
 
 @dataclass
@@ -41,36 +48,47 @@ class SafetyConfig(object):
     }
 
 
-@dataclass
-class StableDiffusionSafePipelineOutput(BaseOutput):
-    """
-    Output class for Safe Stable Diffusion pipelines.
-
-    Args:
-        images (`List[PIL.Image.Image]` or `np.ndarray`)
-            List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
-            num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline.
-        nsfw_content_detected (`List[bool]`)
-            List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work"
-            (nsfw) content, or `None` if safety checking could not be performed.
-        images (`List[PIL.Image.Image]` or `np.ndarray`)
-            List of denoised PIL images that were flagged by the safety checker any may contain "not-safe-for-work"
-            (nsfw) content, or `None` if no safety check was performed or no images were flagged.
-        applied_safety_concept (`str`)
-            The safety concept that was applied for safety guidance, or `None` if safety guidance was disabled
-    """
+_dummy_objects = {}
+_additional_imports = {}
+_import_structure = {
+    "pipeline_output": ["StableDiffusionSafePipelineOutput"],
+    "pipeline_stable_diffusion_safe": ["StableDiffusionPipelineSafe"],
+    "safety_checker": ["SafeStableDiffusionSafetyChecker"],
+}
+_additional_imports.update({"SafetyConfig": SafetyConfig})
 
-    images: Union[List[PIL.Image.Image], np.ndarray]
-    nsfw_content_detected: Optional[List[bool]]
-    unsafe_images: Optional[Union[List[PIL.Image.Image], np.ndarray]]
-    applied_safety_concept: Optional[str]
 
+if TYPE_CHECKING:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_output import StableDiffusionSafePipelineOutput
+        from .pipeline_stable_diffusion_safe import StableDiffusionPipelineSafe
+        from .safety_checker import SafeStableDiffusionSafetyChecker
 
-try:
-    if not (is_transformers_available() and is_torch_available()):
-        raise OptionalDependencyNotAvailable()
-except OptionalDependencyNotAvailable:
-    from ...utils.dummy_torch_and_transformers_objects import *
 else:
-    from .pipeline_stable_diffusion_safe import StableDiffusionPipelineSafe
-    from .safety_checker import SafeStableDiffusionSafetyChecker
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils import dummy_torch_and_transformers_objects
+
+        _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
+    for name, value in _additional_imports.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_output.py b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_output.py
new file mode 100644
index 000000000000..8567a304c696
--- /dev/null
+++ b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_output.py
@@ -0,0 +1,34 @@
+from dataclasses import dataclass
+from typing import List, Optional, Union
+
+import numpy as np
+import PIL
+
+from ...utils import (
+    BaseOutput,
+)
+
+
+@dataclass
+class StableDiffusionSafePipelineOutput(BaseOutput):
+    """
+    Output class for Safe Stable Diffusion pipelines.
+
+    Args:
+        images (`List[PIL.Image.Image]` or `np.ndarray`)
+            List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
+            num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline.
+        nsfw_content_detected (`List[bool]`)
+            List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, or `None` if safety checking could not be performed.
+        unsafe_images (`List[PIL.Image.Image]` or `np.ndarray`)
+            List of denoised PIL images that were flagged by the safety checker and may contain "not-safe-for-work"
+            (nsfw) content, or `None` if no safety check was performed or no images were flagged.
+        applied_safety_concept (`str`)
+            The safety concept that was applied for safety guidance, or `None` if safety guidance was disabled
+    """
+
+    images: Union[List[PIL.Image.Image], np.ndarray]
+    nsfw_content_detected: Optional[List[bool]]
+    unsafe_images: Optional[Union[List[PIL.Image.Image], np.ndarray]]
+    applied_safety_concept: Optional[str]
diff --git a/src/diffusers/pipelines/stable_diffusion_xl/__init__.py b/src/diffusers/pipelines/stable_diffusion_xl/__init__.py
index ebe12db15fd9..2c4bf44f8dec 100644
--- a/src/diffusers/pipelines/stable_diffusion_xl/__init__.py
+++ b/src/diffusers/pipelines/stable_diffusion_xl/__init__.py
@@ -1,3 +1,5 @@
+from typing import TYPE_CHECKING
+
 from ...utils import (
     OptionalDependencyNotAvailable,
     _LazyModule,
@@ -7,10 +9,8 @@
 )
 
-_import_structure = {}
 _dummy_objects = {}
-
-_import_structure["pipeline_output"] = ["StableDiffusionXLPipelineOutput"]
+_import_structure = {"pipeline_output": ["StableDiffusionXLPipelineOutput"]}
 
 try:
     if not (is_transformers_available() and is_torch_available()):
@@ -25,15 +25,28 @@
     _import_structure["pipeline_stable_diffusion_xl_inpaint"] = ["StableDiffusionXLInpaintPipeline"]
     _import_structure["pipeline_stable_diffusion_xl_instruct_pix2pix"] = ["StableDiffusionXLInstructPix2PixPipeline"]
 
-import sys
+if TYPE_CHECKING:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
+    else:
+        from .pipeline_stable_diffusion_xl import StableDiffusionXLPipeline
+        from .pipeline_stable_diffusion_xl_img2img import StableDiffusionXLImg2ImgPipeline
+        from .pipeline_stable_diffusion_xl_inpaint import StableDiffusionXLInpaintPipeline
+        from .pipeline_stable_diffusion_xl_instruct_pix2pix import StableDiffusionXLInstructPix2PixPipeline
 
-sys.modules[__name__] = _LazyModule(
-    __name__,
-    globals()["__file__"],
-    _import_structure,
-    module_spec=__spec__,
-)
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
 
-for name, value in _dummy_objects.items():
-    setattr(sys.modules[__name__], name, value)
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/src/diffusers/pipelines/stochastic_karras_ve/__init__.py b/src/diffusers/pipelines/stochastic_karras_ve/__init__.py
index 2f82b438c5e3..ebc6506f58b3 100644
--- a/src/diffusers/pipelines/stochastic_karras_ve/__init__.py
+++ b/src/diffusers/pipelines/stochastic_karras_ve/__init__.py
@@ -1,15 +1,19 @@
+from typing import TYPE_CHECKING
+
 from ...utils import _LazyModule
 
 
-_import_structure = {}
-_import_structure["pipeline_stochastic_karras_ve"] = ["KarrasVePipeline"]
+_import_structure = {"pipeline_stochastic_karras_ve": ["KarrasVePipeline"]}
 
-import sys
+if TYPE_CHECKING:
+    from .pipeline_stochastic_karras_ve import KarrasVePipeline
+else:
+    import sys
 
-sys.modules[__name__] = _LazyModule(
-    __name__,
-    globals()["__file__"],
-    _import_structure,
-    module_spec=__spec__,
-)
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
diff --git a/src/diffusers/pipelines/t2i_adapter/__init__.py b/src/diffusers/pipelines/t2i_adapter/__init__.py
index b6e6ee724a67..db5dd4ff21b6 100644
--- a/src/diffusers/pipelines/t2i_adapter/__init__.py
+++ b/src/diffusers/pipelines/t2i_adapter/__init__.py
@@ -1,3 +1,5 @@
+from typing import TYPE_CHECKING
+
 from ...utils import (
     OptionalDependencyNotAvailable,
     _LazyModule,
@@ -7,8 +9,8 @@
 )
 
-_import_structure = {}
 _dummy_objects = {}
+_import_structure = {}
 
 try:
     if not (is_transformers_available() and is_torch_available()):
@@ -21,14 +23,24 @@
     _import_structure["pipeline_stable_diffusion_adapter"] = ["StableDiffusionAdapterPipeline"]
     _import_structure["pipeline_stable_diffusion_xl_adapter"] = ["StableDiffusionXLAdapterPipeline"]
 
-import sys
+if TYPE_CHECKING:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
+    else:
+        from .pipeline_stable_diffusion_adapter import StableDiffusionAdapterPipeline
+        from .pipeline_stable_diffusion_xl_adapter import StableDiffusionXLAdapterPipeline
+else:
+    import sys
 
-sys.modules[__name__] = _LazyModule(
-    __name__,
-    globals()["__file__"],
-    _import_structure,
-    module_spec=__spec__,
-)
-for name, value in _dummy_objects.items():
-    setattr(sys.modules[__name__], name, value)
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/src/diffusers/pipelines/text_to_video_synthesis/__init__.py b/src/diffusers/pipelines/text_to_video_synthesis/__init__.py
index af3b9bfde1ce..a09a63476b7c 100644
--- a/src/diffusers/pipelines/text_to_video_synthesis/__init__.py
+++ b/src/diffusers/pipelines/text_to_video_synthesis/__init__.py
@@ -1,3 +1,5 @@
+from typing import TYPE_CHECKING
+
 from ...utils import (
     OptionalDependencyNotAvailable,
     _LazyModule,
@@ -7,8 +9,8 @@
 )
 
-_import_structure = {}
 _dummy_objects = {}
+_import_structure = {}
 
 try:
     if not (is_transformers_available() and is_torch_available()):
@@ -17,19 +19,31 @@
     from ...utils import dummy_torch_and_transformers_objects  # noqa F403
 
     _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
-
 else:
     _import_structure["pipeline_output"] = ["TextToVideoSDPipelineOutput"]
     _import_structure["pipeline_text_to_video_synth"] = ["TextToVideoSDPipeline"]
     _import_structure["pipeline_text_to_video_synth_img2img"] = ["VideoToVideoSDPipeline"]
     _import_structure["pipeline_text_to_video_zero"] = ["TextToVideoZeroPipeline"]
 
-import sys
+if TYPE_CHECKING:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
+    else:
+        from .pipeline_output import TextToVideoSDPipelineOutput
+        from .pipeline_text_to_video_synth import TextToVideoSDPipeline
+        from .pipeline_text_to_video_synth_img2img import VideoToVideoSDPipeline
+        from .pipeline_text_to_video_zero import TextToVideoZeroPipeline
 
-sys.modules[__name__] = _LazyModule(
-    __name__,
-    globals()["__file__"],
-    _import_structure,
-    module_spec=__spec__,
-)
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
diff --git a/src/diffusers/pipelines/unclip/__init__.py b/src/diffusers/pipelines/unclip/__init__.py
index f546dbb5041d..6d6a6398bcec 100644
--- a/src/diffusers/pipelines/unclip/__init__.py
+++ b/src/diffusers/pipelines/unclip/__init__.py
@@ -1,3 +1,5 @@
+from typing import TYPE_CHECKING
+
 from ...utils import (
     OptionalDependencyNotAvailable,
     _LazyModule,
@@ -7,9 +9,8 @@
 )
 
-_import_structure = {}
 _dummy_objects = {}
-
+_import_structure = {}
 
 try:
     if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0")):
@@ -25,14 +26,26 @@
     _import_structure["pipeline_unclip_image_variation"] = ["UnCLIPImageVariationPipeline"]
     _import_structure["text_proj"] = ["UnCLIPTextProjModel"]
 
-import sys
+if TYPE_CHECKING:
+    try:
+        if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0")):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
+    else:
+        from .pipeline_unclip import UnCLIPPipeline
+        from .pipeline_unclip_image_variation import UnCLIPImageVariationPipeline
+        from .text_proj import UnCLIPTextProjModel
 
-sys.modules[__name__] = _LazyModule(
-    __name__,
-    globals()["__file__"],
-    _import_structure,
-    module_spec=__spec__,
-)
-for name, value in _dummy_objects.items():
-    setattr(sys.modules[__name__], name, value)
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/src/diffusers/pipelines/unidiffuser/__init__.py b/src/diffusers/pipelines/unidiffuser/__init__.py
index ac0207b6045d..52bdb0c40552 100644
--- a/src/diffusers/pipelines/unidiffuser/__init__.py
+++ b/src/diffusers/pipelines/unidiffuser/__init__.py
@@ -1,3 +1,5 @@
+from typing import TYPE_CHECKING
+
 from ...utils import (
     OptionalDependencyNotAvailable,
     _LazyModule,
@@ -6,9 +8,8 @@
 )
 
-_import_structure = {}
 _dummy_objects = {}
-
+_import_structure = {}
 
 try:
     if not (is_transformers_available() and is_torch_available()):
@@ -22,21 +23,35 @@
     _dummy_objects.update(
         {"ImageTextPipelineOutput": ImageTextPipelineOutput, "UniDiffuserPipeline": UniDiffuserPipeline}
     )
-
 else:
     _import_structure["modeling_text_decoder"] = ["UniDiffuserTextDecoder"]
     _import_structure["modeling_uvit"] = ["UniDiffuserModel", "UTransformer2DModel"]
     _import_structure["pipeline_unidiffuser"] = ["ImageTextPipelineOutput", "UniDiffuserPipeline"]
 
-import sys
+if TYPE_CHECKING:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import (
+            ImageTextPipelineOutput,
+            UniDiffuserPipeline,
+        )
+    else:
+        from .modeling_text_decoder import UniDiffuserTextDecoder
+        from .modeling_uvit import UniDiffuserModel, UTransformer2DModel
+        from .pipeline_unidiffuser import ImageTextPipelineOutput, UniDiffuserPipeline
 
-sys.modules[__name__] = _LazyModule(
-    __name__,
-    globals()["__file__"],
-    _import_structure,
-    module_spec=__spec__,
-)
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
 
-for name, value in _dummy_objects.items():
-    setattr(sys.modules[__name__], name, value)
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/src/diffusers/pipelines/versatile_diffusion/__init__.py b/src/diffusers/pipelines/versatile_diffusion/__init__.py
index 8fbe932b18a6..ba7019c24d94 100644
--- a/src/diffusers/pipelines/versatile_diffusion/__init__.py
+++ b/src/diffusers/pipelines/versatile_diffusion/__init__.py
@@ -1,3 +1,5 @@
+from typing import TYPE_CHECKING
+
 from ...utils import (
     OptionalDependencyNotAvailable,
     _LazyModule,
@@ -7,9 +9,8 @@
 )
 
-_import_structure = {}
 _dummy_objects = {}
-
+_import_structure = {}
 
 try:
     if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0")):
@@ -37,15 +38,33 @@
     _import_structure["pipeline_versatile_diffusion_image_variation"] = ["VersatileDiffusionImageVariationPipeline"]
     _import_structure["pipeline_versatile_diffusion_text_to_image"] = ["VersatileDiffusionTextToImagePipeline"]
 
-import sys
+if TYPE_CHECKING:
+    try:
+        if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0")):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import (
+            VersatileDiffusionDualGuidedPipeline,
+            VersatileDiffusionImageVariationPipeline,
+            VersatileDiffusionPipeline,
+            VersatileDiffusionTextToImagePipeline,
+        )
+    else:
+        from .pipeline_versatile_diffusion import VersatileDiffusionPipeline
+        from .pipeline_versatile_diffusion_dual_guided import VersatileDiffusionDualGuidedPipeline
+        from .pipeline_versatile_diffusion_image_variation import VersatileDiffusionImageVariationPipeline
+        from .pipeline_versatile_diffusion_text_to_image import VersatileDiffusionTextToImagePipeline
 
-sys.modules[__name__] = _LazyModule(
-    __name__,
-    globals()["__file__"],
-    _import_structure,
-    module_spec=__spec__,
-)
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
 
-for name, value in _dummy_objects.items():
-    setattr(sys.modules[__name__], name, value)
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/src/diffusers/pipelines/vq_diffusion/__init__.py b/src/diffusers/pipelines/vq_diffusion/__init__.py
index 8917802c2694..b8fb7f55e8ce 100644
--- a/src/diffusers/pipelines/vq_diffusion/__init__.py
+++ b/src/diffusers/pipelines/vq_diffusion/__init__.py
@@ -1,3 +1,5 @@
+from typing import TYPE_CHECKING
+
 from ...utils import (
     OptionalDependencyNotAvailable,
     _LazyModule,
@@ -6,9 +8,8 @@
 )
 
-_import_structure = {}
 _dummy_objects = {}
-
+_import_structure = {}
 
 try:
     if not (is_transformers_available() and is_torch_available()):
@@ -28,12 +29,25 @@
 else:
     _import_structure["pipeline_vq_diffusion"] = ["LearnedClassifierFreeSamplingEmbeddings", "VQDiffusionPipeline"]
 
-import sys
+if TYPE_CHECKING:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import (
+            LearnedClassifierFreeSamplingEmbeddings,
+            VQDiffusionPipeline,
+        )
+    else:
+        from .pipeline_vq_diffusion import LearnedClassifierFreeSamplingEmbeddings, VQDiffusionPipeline
 
-sys.modules[__name__] = _LazyModule(
-    __name__,
-    globals()["__file__"],
-    _import_structure,
-    module_spec=__spec__,
-)
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
diff --git a/src/diffusers/pipelines/wuerstchen/__init__.py b/src/diffusers/pipelines/wuerstchen/__init__.py
index 17da4c1ad8c5..13407f2cd10c 100644
--- a/src/diffusers/pipelines/wuerstchen/__init__.py
+++ b/src/diffusers/pipelines/wuerstchen/__init__.py
@@ -1,3 +1,5 @@
+from typing import TYPE_CHECKING
+
 from ...utils import (
     OptionalDependencyNotAvailable,
     _LazyModule,
@@ -7,17 +9,16 @@
 )
 
-_import_structure = {}
 _dummy_objects = {}
+_import_structure = {}
+
 
 try:
     if not (is_transformers_available() and is_torch_available()):
         raise OptionalDependencyNotAvailable()
-
 except OptionalDependencyNotAvailable:
     from ...utils import dummy_torch_and_transformers_objects
 
     _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
-
 else:
     _import_structure["modeling_paella_vq_model"] = ["PaellaVQModel"]
     _import_structure["modeling_wuerstchen_diffnext"] = ["WuerstchenDiffNeXt"]
@@ -27,12 +28,26 @@
     _import_structure["pipeline_wuerstchen_prior"] = ["DEFAULT_STAGE_C_TIMESTEPS", "WuerstchenPriorPipeline"]
 
-import sys
-
+if TYPE_CHECKING:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
+    else:
+        from .modeling_paella_vq_model import PaellaVQModel
+        from .modeling_wuerstchen_diffnext import WuerstchenDiffNeXt
+        from .modeling_wuerstchen_prior import WuerstchenPrior
+        from .pipeline_wuerstchen import WuerstchenDecoderPipeline
+        from .pipeline_wuerstchen_combined import WuerstchenCombinedPipeline
+        from .pipeline_wuerstchen_prior import WuerstchenPriorPipeline
 
-sys.modules[__name__] = _LazyModule(
-    __name__,
-    globals()["__file__"],
-    _import_structure,
-    module_spec=__spec__,
-)
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
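The schedulers diff below also retires the hand-rolled `dir()` loops that only kept names ending in `Scheduler`, in favor of the shared `get_objects_from_module` helper already used by the pipeline packages above. A plausible minimal equivalent, shown to make the replacement concrete (an assumption for illustration, not the verbatim diffusers utility):

    def get_objects_from_module(module):
        """Collect the public attributes of `module` as a {name: object} mapping."""
        return {name: getattr(module, name) for name in dir(module) if not name.startswith("_")}

Because this keeps every public placeholder rather than only `*Scheduler` names, the same helper can serve pipelines and schedulers alike.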
+from typing import TYPE_CHECKING from ..utils import ( OptionalDependencyNotAvailable, _LazyModule, + get_objects_from_module, is_flax_available, is_scipy_available, is_torch_available, @@ -23,8 +25,8 @@ ) -_import_structure = {} _dummy_modules = {} +_import_structure = {} try: if not is_torch_available(): @@ -32,12 +34,7 @@ except OptionalDependencyNotAvailable: from ..utils import dummy_pt_objects # noqa F403 - modules = {} - for name in dir(dummy_pt_objects): - if (not name.endswith("Scheduler")) or name.startswith("_"): - continue - modules[name] = getattr(dummy_pt_objects, name) - _dummy_modules.update(modules) + _dummy_modules.update(get_objects_from_module(dummy_pt_objects)) else: _import_structure["scheduling_consistency_models"] = ["CMStochasticIterativeScheduler"] @@ -46,6 +43,7 @@ _import_structure["scheduling_ddim_parallel"] = ["DDIMParallelScheduler"] _import_structure["scheduling_ddpm"] = ["DDPMScheduler"] _import_structure["scheduling_ddpm_parallel"] = ["DDPMParallelScheduler"] + _import_structure["scheduling_ddpm_wuerstchen"] = ["DDPMWuerstchenScheduler"] _import_structure["scheduling_deis_multistep"] = ["DEISMultistepScheduler"] _import_structure["scheduling_dpmsolver_multistep"] = ["DPMSolverMultistepScheduler"] _import_structure["scheduling_dpmsolver_multistep_inverse"] = ["DPMSolverMultistepInverseScheduler"] @@ -65,13 +63,15 @@ _import_structure["scheduling_unipc_multistep"] = ["UniPCMultistepScheduler"] _import_structure["scheduling_utils"] = ["KarrasDiffusionSchedulers", "SchedulerMixin"] _import_structure["scheduling_vq_diffusion"] = ["VQDiffusionScheduler"] - _import_structure["scheduling_ddpm_wuerstchen"] = ["DDPMWuerstchenScheduler"] try: if not is_flax_available(): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ..utils.dummy_flax_objects import * # noqa F403 + from ..utils import dummy_flax_objects # noqa F403 + + _dummy_modules.update(get_objects_from_module(dummy_flax_objects)) + else: _import_structure["scheduling_ddim_flax"] = ["FlaxDDIMScheduler"] _import_structure["scheduling_ddpm_flax"] = ["FlaxDDPMScheduler"] @@ -94,13 +94,7 @@ except OptionalDependencyNotAvailable: from ..utils import dummy_torch_and_scipy_objects # noqa F403 - modules = {} - for name in dir(dummy_torch_and_scipy_objects): - if (not name.endswith("Scheduler")) or name.startswith("_"): - continue - modules[name] = getattr(dummy_torch_and_scipy_objects, name) - - _dummy_modules.update(modules) + _dummy_modules.update(get_objects_from_module(dummy_torch_and_scipy_objects)) else: _import_structure["scheduling_lms_discrete"] = ["LMSDiscreteScheduler"] @@ -111,21 +105,92 @@ except OptionalDependencyNotAvailable: from ..utils import dummy_torch_and_torchsde_objects # noqa F403 - modules = {} - for name in dir(dummy_torch_and_torchsde_objects): - if (not name.endswith("Scheduler")) or name.startswith("_"): - continue - modules[name] = getattr(dummy_torch_and_torchsde_objects, name) - - _dummy_modules.update(modules) - + _dummy_modules.update(get_objects_from_module(dummy_torch_and_torchsde_objects)) else: _import_structure["scheduling_dpmsolver_sde"] = ["DPMSolverSDEScheduler"] -import sys +if TYPE_CHECKING: + from ..utils import ( + OptionalDependencyNotAvailable, + is_flax_available, + is_scipy_available, + is_torch_available, + is_torchsde_available, + ) + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ..utils.dummy_pt_objects import * # noqa F403 + else: + from 
.scheduling_consistency_models import CMStochasticIterativeScheduler + from .scheduling_ddim import DDIMScheduler + from .scheduling_ddim_inverse import DDIMInverseScheduler + from .scheduling_ddim_parallel import DDIMParallelScheduler + from .scheduling_ddpm import DDPMScheduler + from .scheduling_ddpm_parallel import DDPMParallelScheduler + from .scheduling_ddpm_wuerstchen import DDPMWuerstchenScheduler + from .scheduling_deis_multistep import DEISMultistepScheduler + from .scheduling_dpmsolver_multistep import DPMSolverMultistepScheduler + from .scheduling_dpmsolver_multistep_inverse import DPMSolverMultistepInverseScheduler + from .scheduling_dpmsolver_singlestep import DPMSolverSinglestepScheduler + from .scheduling_euler_ancestral_discrete import EulerAncestralDiscreteScheduler + from .scheduling_euler_discrete import EulerDiscreteScheduler + from .scheduling_heun_discrete import HeunDiscreteScheduler + from .scheduling_ipndm import IPNDMScheduler + from .scheduling_k_dpm_2_ancestral_discrete import KDPM2AncestralDiscreteScheduler + from .scheduling_k_dpm_2_discrete import KDPM2DiscreteScheduler + from .scheduling_karras_ve import KarrasVeScheduler + from .scheduling_pndm import PNDMScheduler + from .scheduling_repaint import RePaintScheduler + from .scheduling_sde_ve import ScoreSdeVeScheduler + from .scheduling_sde_vp import ScoreSdeVpScheduler + from .scheduling_unclip import UnCLIPScheduler + from .scheduling_unipc_multistep import UniPCMultistepScheduler + from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin + from .scheduling_vq_diffusion import VQDiffusionScheduler + + try: + if not is_flax_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ..utils.dummy_flax_objects import * # noqa F403 + else: + from .scheduling_ddim_flax import FlaxDDIMScheduler + from .scheduling_ddpm_flax import FlaxDDPMScheduler + from .scheduling_dpmsolver_multistep_flax import FlaxDPMSolverMultistepScheduler + from .scheduling_karras_ve_flax import FlaxKarrasVeScheduler + from .scheduling_lms_discrete_flax import FlaxLMSDiscreteScheduler + from .scheduling_pndm_flax import FlaxPNDMScheduler + from .scheduling_sde_ve_flax import FlaxScoreSdeVeScheduler + from .scheduling_utils_flax import ( + FlaxKarrasDiffusionSchedulers, + FlaxSchedulerMixin, + FlaxSchedulerOutput, + broadcast_to_shape_from_left, + ) + + try: + if not (is_torch_available() and is_scipy_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ..utils.dummy_torch_and_scipy_objects import * # noqa F403 + else: + from .scheduling_lms_discrete import LMSDiscreteScheduler + + try: + if not (is_torch_available() and is_torchsde_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ..utils.dummy_torch_and_torchsde_objects import * # noqa F403 + else: + from .scheduling_dpmsolver_sde import DPMSolverSDEScheduler +else: + import sys -sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) -for name, value in _dummy_modules.items(): - setattr(sys.modules[__name__], name, value) + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + for name, value in _dummy_modules.items(): + setattr(sys.modules[__name__], name, value) diff --git a/utils/custom_init_isort.py b/utils/custom_init_isort.py index f8ef799c5e6c..e1e85974aeed 100644 --- a/utils/custom_init_isort.py +++ 
b/utils/custom_init_isort.py
@@ -12,12 +12,35 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""
+Utility that sorts the imports in the custom inits of Diffusers. Diffusers uses init files that delay the
+import of an object to when it's actually needed. This is to avoid the main init importing all models, which would
+make the line `import diffusers` very slow when the user has all optional dependencies installed. The inits with
+delayed imports have two halves: one defining a dictionary `_import_structure` which maps modules to the names of the
+objects in each module, and one in `TYPE_CHECKING` which looks like a normal init for type-checkers. `isort` or `ruff`
+properly sort the second half, which looks like traditional imports; the goal of this script is to sort the first half.
+
+Use from the root of the repo with:
+
+```bash
+python utils/custom_init_isort.py
+```
+
+which will auto-sort the imports (used in `make style`).
+
+For a check only (as used in `make quality`) run:
+
+```bash
+python utils/custom_init_isort.py --check_only
+```
+"""
 import argparse
 import os
 import re
+from typing import Any, Callable, List, Optional
 
+# Path is defined with the intent that you should run this script from the root of the repo.
 PATH_TO_TRANSFORMERS = "src/diffusers"
 
 # Pattern that looks at the indentation in a line.
@@ -32,17 +55,30 @@
 _re_bracket_content = re.compile(r"\[([^\]]+)\]")
 
 
-def get_indent(line):
-    """Returns the indent in `line`."""
+def get_indent(line: str) -> str:
+    """Returns the indent in the given line (as a string)."""
     search = _re_indent.search(line)
     return "" if search is None else search.groups()[0]
 
 
-def split_code_in_indented_blocks(code, indent_level="", start_prompt=None, end_prompt=None):
+def split_code_in_indented_blocks(
+    code: str, indent_level: str = "", start_prompt: Optional[str] = None, end_prompt: Optional[str] = None
+) -> List[str]:
     """
-    Split `code` into its indented blocks, starting at `indent_level`. If provided, begins splitting after
-    `start_prompt` and stops at `end_prompt` (but returns what's before `start_prompt` as a first block and what's
-    after `end_prompt` as a last block, so `code` is always the same as joining the result of this function).
+    Split some code into its indented blocks, starting at a given level.
+
+    Args:
+        code (`str`): The code to split.
+        indent_level (`str`): The indent level (as a string) to use for identifying the blocks to split.
+        start_prompt (`str`, *optional*): If provided, only starts splitting at the line where this text is.
+        end_prompt (`str`, *optional*): If provided, stops splitting at a line where this text is.
+
+    Warning:
+        The text before `start_prompt` or after `end_prompt` (if provided) is not ignored, just not split. The input
+        `code` can thus be retrieved by joining the result.
+
+    Returns:
+        `List[str]`: The list of blocks.
     """
     # Let's split the code into lines and move to start_index.
     index = 0
@@ -54,12 +90,17 @@ def split_code_in_indented_blocks(code, indent_level="", start_prompt=None, end_
     else:
         blocks = []
 
-    # We split into blocks until we get to the `end_prompt` (or the end of the block).
+    # This variable contains the block treated at a given time.
     current_block = [lines[index]]
     index += 1
+    # We split into blocks until we get to the `end_prompt` (or the end of the file).
     while index < len(lines) and (end_prompt is None or not lines[index].startswith(end_prompt)):
+        # We have a non-empty line with the proper indent -> start of a new block
         if len(lines[index]) > 0 and get_indent(lines[index]) == indent_level:
+            # Store the current block in the result and reset it. There are two cases: the line is part of the block
+            # (like a closing parenthesis) or not.
             if len(current_block) > 0 and get_indent(current_block[-1]).startswith(indent_level + " "):
+                # Line is part of the current block
                 current_block.append(lines[index])
                 blocks.append("\n".join(current_block))
                 if index < len(lines) - 1:
@@ -68,9 +109,11 @@ def split_code_in_indented_blocks(code, indent_level="", start_prompt=None, end_
             else:
                 current_block = []
         else:
+            # Line is not part of the current block
             blocks.append("\n".join(current_block))
             current_block = [lines[index]]
         else:
+            # Just add the line to the current block
             current_block.append(lines[index])
         index += 1
 
@@ -85,8 +128,10 @@ def split_code_in_indented_blocks(code, indent_level="", start_prompt=None, end_
     return blocks
 
 
-def ignore_underscore(key):
-    "Wraps a `key` (that maps an object to string) to lower case and remove underscores."
+def ignore_underscore_and_lowercase(key: Callable[[Any], str]) -> Callable[[Any], str]:
+    """
+    Wraps a key function (as used in a sort) to lowercase and ignore underscores.
+    """
 
     def _inner(x):
         return key(x).lower().replace("_", "")
@@ -94,8 +139,21 @@ def _inner(x):
     return _inner
 
 
-def sort_objects(objects, key=None):
-    "Sort a list of `objects` following the rules of isort. `key` optionally maps an object to a str."
+def sort_objects(objects: List[Any], key: Optional[Callable[[Any], str]] = None) -> List[Any]:
+    """
+    Sort a list of objects following the rules of isort (all uppercased first, camel-cased second and lower-cased
+    last).
+
+    Args:
+        objects (`List[Any]`):
+            The list of objects to sort.
+        key (`Callable[[Any], str]`, *optional*):
+            A function taking an object as input and returning a string, used to sort them by alphabetical order.
+            If not provided, will default to noop (so a `key` must be provided if the `objects` are not of type string).
+
+    Returns:
+        `List[Any]`: The sorted list with the same elements as in the inputs.
+    """
     # If no key is provided, we use a noop.
     def noop(x):
@@ -110,18 +168,26 @@ def noop(x):
     # Functions begin with a lowercase, they go last.
     functions = [obj for obj in objects if not key(obj)[0].isupper()]
 
-    key1 = ignore_underscore(key)
+    # Then we sort each group.
+    key1 = ignore_underscore_and_lowercase(key)
     return sorted(constants, key=key1) + sorted(classes, key=key1) + sorted(functions, key=key1)
 
 
-def sort_objects_in_import(import_statement):
+def sort_objects_in_import(import_statement: str) -> str:
     """
-    Return the same `import_statement` but with objects properly sorted.
+    Sorts the imports in a single import statement.
+
+    Args:
+        import_statement (`str`): The import statement in which to sort the imports.
+
+    Returns:
+        `str`: The same as the input, but with objects properly sorted.
     """
 
     # This inner function sorts imports between [ ].
     def _replace(match):
         imports = match.groups()[0]
+        # If there is one import only, nothing to do.
         if "," not in imports:
             return f"[{imports}]"
         keys = [part.strip().replace('"', "") for part in imports.split(",")]
@@ -165,13 +231,18 @@ def _replace(match):
     return import_statement
 
 
-def sort_imports(file, check_only=True):
+def sort_imports(file: str, check_only: bool = True):
     """
-    Sort `_import_structure` imports in `file`, `check_only` determines if we only check or overwrite.
+    Sort the imports defined in the `_import_structure` of a given init.
+
+    Args:
+        file (`str`): The path to the init to check/fix.
+        check_only (`bool`, *optional*, defaults to `True`): Whether or not to just check (and not auto-fix) the init.
     """
-    with open(file, "r") as f:
+    with open(file, encoding="utf-8") as f:
         code = f.read()
 
+    # If the file is not a custom init, there is nothing to do.
    if "_import_structure" not in code:
         return
 
@@ -180,7 +251,7 @@ def sort_imports(file, check_only=True):
         code, start_prompt="_import_structure = {", end_prompt="if TYPE_CHECKING:"
     )
 
-    # We ignore block 0 (everything until start_prompt) and the last block (everything after end_prompt).
+    # We ignore block 0 (everything until start_prompt) and the last block (everything after end_prompt).
     for block_idx in range(1, len(main_blocks) - 1):
         # Check if the block contains some `_import_structure` entries to sort.
         block = main_blocks[block_idx]
@@ -203,7 +274,7 @@ def _replace(match):
             # Split the internal block into blocks of indent level 1.
             internal_blocks = split_code_in_indented_blocks(internal_block_code, indent_level=indent)
             # We have two categories of import key: list or _import_structure[key].append/extend
-            pattern = _re_direct_key if "_import_structure" in block_lines[0] else _re_indirect_key
+            pattern = _re_direct_key if "_import_structure = {" in block_lines[0] else _re_indirect_key
             # Grab the keys, but there is a trap: some lines are empty or just comments.
             keys = [(pattern.search(b).groups()[0] if pattern.search(b) is not None else None) for b in internal_blocks]
             # We only sort the lines with a key.
@@ -229,11 +300,17 @@ def sort_imports(file, check_only=True):
             return True
         else:
             print(f"Overwriting {file}.")
-            with open(file, "w") as f:
+            with open(file, "w", encoding="utf-8") as f:
                 f.write("\n".join(main_blocks))
 
 
 def sort_imports_in_all_inits(check_only=True):
+    """
+    Sort the imports defined in the `_import_structure` of all inits in the repo.
+
+    Args:
+        check_only (`bool`, *optional*, defaults to `True`): Whether or not to just check (and not auto-fix) the init.
+ """ failures = [] for root, _, files in os.walk(PATH_TO_TRANSFORMERS): if "__init__.py" in files: From c806f2fad6040d50b3d291076cab0195863ba328 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Tue, 12 Sep 2023 18:35:29 +0530 Subject: [PATCH 18/37] remove extra gligen in import (#4987) --- src/diffusers/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 87feab66503b..9c1d1fe2e757 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -236,7 +236,6 @@ "StableDiffusionDepth2ImgPipeline", "StableDiffusionDiffEditPipeline", "StableDiffusionGLIGENPipeline", - "StableDiffusionGLIGENPipeline", "StableDiffusionGLIGENTextImagePipeline", "StableDiffusionImageVariationPipeline", "StableDiffusionImg2ImgPipeline", From 73bf620dec56ee3a24b88bee53763617332223cc Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 12 Sep 2023 16:52:25 +0200 Subject: [PATCH 19/37] fix E721 Do not compare types, use `isinstance()` (#4992) --- examples/community/lpw_stable_diffusion_xl.py | 2 +- examples/community/stable_diffusion_xl_reference.py | 2 +- src/diffusers/experimental/rl/value_guided_sampling.py | 2 +- .../pipelines/audio_diffusion/pipeline_audio_diffusion.py | 2 +- .../stable_diffusion_xl/pipeline_stable_diffusion_xl.py | 2 +- .../pipeline_stable_diffusion_xl_img2img.py | 2 +- .../pipeline_stable_diffusion_xl_inpaint.py | 2 +- .../pipeline_stable_diffusion_xl_instruct_pix2pix.py | 2 +- .../t2i_adapter/pipeline_stable_diffusion_xl_adapter.py | 2 +- .../pipelines/consistency_models/test_consistency_models.py | 2 +- tests/pipelines/unidiffuser/test_unidiffuser.py | 6 +++--- 11 files changed, 13 insertions(+), 13 deletions(-) diff --git a/examples/community/lpw_stable_diffusion_xl.py b/examples/community/lpw_stable_diffusion_xl.py index 2ee44b95ab0a..61a49eb2b3bf 100644 --- a/examples/community/lpw_stable_diffusion_xl.py +++ b/examples/community/lpw_stable_diffusion_xl.py @@ -1138,7 +1138,7 @@ def __call__( num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) # 7.1 Apply denoising_end - if denoising_end is not None and type(denoising_end) == float and denoising_end > 0 and denoising_end < 1: + if denoising_end is not None and isinstance(denoising_end, float) and denoising_end > 0 and denoising_end < 1: discrete_timestep_cutoff = int( round( self.scheduler.config.num_train_timesteps diff --git a/examples/community/stable_diffusion_xl_reference.py b/examples/community/stable_diffusion_xl_reference.py index 7549135b220f..a7654f11bcc9 100644 --- a/examples/community/stable_diffusion_xl_reference.py +++ b/examples/community/stable_diffusion_xl_reference.py @@ -701,7 +701,7 @@ def hacked_UpBlock2D_forward(self, hidden_states, res_hidden_states_tuple, temb= num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) # 10.1 Apply denoising_end - if denoising_end is not None and type(denoising_end) == float and denoising_end > 0 and denoising_end < 1: + if denoising_end is not None and isinstance(denoising_end, float) and denoising_end > 0 and denoising_end < 1: discrete_timestep_cutoff = int( round( self.scheduler.config.num_train_timesteps diff --git a/src/diffusers/experimental/rl/value_guided_sampling.py b/src/diffusers/experimental/rl/value_guided_sampling.py index 262039be4fdb..dfb27587d7d5 100644 --- a/src/diffusers/experimental/rl/value_guided_sampling.py +++ b/src/diffusers/experimental/rl/value_guided_sampling.py @@ -76,7 +76,7 @@ def de_normalize(self, x_in, key): 
return x_in * self.stds[key] + self.means[key] def to_torch(self, x_in): - if type(x_in) is dict: + if isinstance(x_in, dict): return {k: self.to_torch(v) for k, v in x_in.items()} elif torch.is_tensor(x_in): return x_in.to(self.unet.device) diff --git a/src/diffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py b/src/diffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py index a06217c19bf7..6c4ae88b228d 100644 --- a/src/diffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py +++ b/src/diffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py @@ -178,7 +178,7 @@ def __call__( self.scheduler.set_timesteps(steps) step_generator = step_generator or generator # For backwards compatibility - if type(self.unet.config.sample_size) == int: + if isinstance(self.unet.config.sample_size, int): self.unet.config.sample_size = (self.unet.config.sample_size, self.unet.config.sample_size) if noise is None: noise = randn_tensor( diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index 10e966b248ab..84fc9c7c5788 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -810,7 +810,7 @@ def __call__( num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) # 7.1 Apply denoising_end - if denoising_end is not None and type(denoising_end) == float and denoising_end > 0 and denoising_end < 1: + if denoising_end is not None and isinstance(denoising_end, float) and denoising_end > 0 and denoising_end < 1: discrete_timestep_cutoff = int( round( self.scheduler.config.num_train_timesteps diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index 8e26a2ad067d..4b66193f75a9 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -885,7 +885,7 @@ def __call__( # 5. Prepare timesteps def denoising_value_valid(dnv): - return type(denoising_end) == float and 0 < dnv < 1 + return isinstance(denoising_end, float) and 0 < dnv < 1 self.scheduler.set_timesteps(num_inference_steps, device=device) timesteps, num_inference_steps = self.get_timesteps( diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py index 6fdc688d9eae..55baada04294 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py @@ -1120,7 +1120,7 @@ def __call__( # 4. 
set timesteps def denoising_value_valid(dnv): - return type(denoising_end) == float and 0 < dnv < 1 + return isinstance(denoising_end, float) and 0 < dnv < 1 self.scheduler.set_timesteps(num_inference_steps, device=device) timesteps, num_inference_steps = self.get_timesteps( diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py index 614cc0e6477c..786231dd5c15 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py @@ -837,7 +837,7 @@ def __call__( # 11. Denoising loop num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) - if denoising_end is not None and type(denoising_end) == float and denoising_end > 0 and denoising_end < 1: + if denoising_end is not None and isinstance(denoising_end, float) and denoising_end > 0 and denoising_end < 1: discrete_timestep_cutoff = int( round( self.scheduler.config.num_train_timesteps diff --git a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py index 4bf0e3311865..d7441db70741 100644 --- a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py @@ -886,7 +886,7 @@ def __call__( num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) # 7.1 Apply denoising_end - if denoising_end is not None and type(denoising_end) == float and denoising_end > 0 and denoising_end < 1: + if denoising_end is not None and isinstance(denoising_end, float) and denoising_end > 0 and denoising_end < 1: discrete_timestep_cutoff = int( round( self.scheduler.config.num_train_timesteps diff --git a/tests/pipelines/consistency_models/test_consistency_models.py b/tests/pipelines/consistency_models/test_consistency_models.py index 6732d5228d50..59be333b629b 100644 --- a/tests/pipelines/consistency_models/test_consistency_models.py +++ b/tests/pipelines/consistency_models/test_consistency_models.py @@ -193,7 +193,7 @@ def get_inputs(self, seed=0, get_fixed_latents=False, device="cpu", dtype=torch. return inputs def get_fixed_latents(self, seed=0, device="cpu", dtype=torch.float32, shape=(1, 3, 64, 64)): - if type(device) == str: + if isinstance(device, str): device = torch.device(device) generator = torch.Generator(device=device).manual_seed(seed) latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) diff --git a/tests/pipelines/unidiffuser/test_unidiffuser.py b/tests/pipelines/unidiffuser/test_unidiffuser.py index 865a7cfa6933..eec341db50db 100644 --- a/tests/pipelines/unidiffuser/test_unidiffuser.py +++ b/tests/pipelines/unidiffuser/test_unidiffuser.py @@ -109,7 +109,7 @@ def get_dummy_inputs(self, device, seed=0): return inputs def get_fixed_latents(self, device, seed=0): - if type(device) == str: + if isinstance(device, str): device = torch.device(device) generator = torch.Generator(device=device).manual_seed(seed) # Hardcode the shapes for now. 
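
These hunks all make the same mechanical substitution. The rationale behind E721 is that `type(x) == T` compares exact classes and silently rejects subclasses, while `isinstance` honors inheritance. A minimal illustration, independent of the repo:

```python
class Scheduler:
    pass

class FancyScheduler(Scheduler):
    pass

s = FancyScheduler()
print(type(s) == Scheduler)      # False: exact-type equality ignores inheritance
print(isinstance(s, Scheduler))  # True: isinstance accepts subclasses too
```
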
@@ -545,7 +545,7 @@ def get_inputs(self, device, seed=0, generate_latents=False): return inputs def get_fixed_latents(self, device, seed=0): - if type(device) == str: + if isinstance(device, str): device = torch.device(device) latent_device = torch.device("cpu") generator = torch.Generator(device=latent_device).manual_seed(seed) @@ -648,7 +648,7 @@ def get_inputs(self, device, seed=0, generate_latents=False): return inputs def get_fixed_latents(self, device, seed=0): - if type(device) == str: + if isinstance(device, str): device = torch.device(device) latent_device = torch.device("cpu") generator = torch.Generator(device=latent_device).manual_seed(seed) From 5d28d2217f328b0d5126ac22f018a431fa9520fe Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 12 Sep 2023 16:55:13 +0200 Subject: [PATCH 20/37] [Wuerstchen] fix combined pipeline's num_images_per_prompt (#4989) * fix encode_prompt * added prompt_embeds and negative_prompt_embeds * prompt_embeds for the prior only --- .../wuerstchen/pipeline_wuerstchen.py | 6 +- .../pipeline_wuerstchen_combined.py | 17 +++- .../wuerstchen/pipeline_wuerstchen_prior.py | 99 ++++++++++++------- 3 files changed, 84 insertions(+), 38 deletions(-) diff --git a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py index 4dfa4727ddd0..55e4b01f3da2 100644 --- a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +++ b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py @@ -330,7 +330,11 @@ def __call__( # 2. Encode caption prompt_embeds, negative_prompt_embeds = self.encode_prompt( - prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt + prompt, + device, + image_embeddings.size(0) * num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, ) text_encoder_hidden_states = ( torch.cat([prompt_embeds, negative_prompt_embeds]) if negative_prompt_embeds is not None else prompt_embeds diff --git a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py index 590162bd0d16..155dc88345eb 100644 --- a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +++ b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py @@ -154,6 +154,8 @@ def __call__( decoder_timesteps: Optional[List[float]] = None, decoder_guidance_scale: float = 0.0, negative_prompt: Optional[Union[str, List[str]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, num_images_per_prompt: int = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, latents: Optional[torch.FloatTensor] = None, @@ -165,10 +167,17 @@ def __call__( Args: prompt (`str` or `List[str]`): - The prompt or prompts to guide the image generation. + The prompt or prompts to guide the image generation for the prior and decoder. negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings for the prior. 
Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. height (`int`, *optional*, defaults to 512): @@ -221,13 +230,15 @@ def __call__( otherwise a `tuple`. When returning a tuple, the first element is a list with the generated images. """ prior_outputs = self.prior_pipe( - prompt=prompt, + prompt=prompt if prompt_embeds is None else None, height=height, width=width, num_inference_steps=prior_num_inference_steps, timesteps=prior_timesteps, guidance_scale=prior_guidance_scale, - negative_prompt=negative_prompt, + negative_prompt=negative_prompt if negative_prompt_embeds is None else None, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, num_images_per_prompt=num_images_per_prompt, generator=generator, latents=latents, diff --git a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py index 9993d30b2072..46a6885c1f39 100644 --- a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +++ b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py @@ -150,41 +150,57 @@ def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): def encode_prompt( self, - prompt, device, num_images_per_prompt, do_classifier_free_guidance, + prompt=None, negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, ): - batch_size = len(prompt) if isinstance(prompt, list) else 1 - # get prompt text embeddings - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - text_input_ids = text_inputs.input_ids - attention_mask = text_inputs.attention_mask + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # get prompt text embeddings + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + attention_mask = text_inputs.attention_mask - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] + attention_mask = attention_mask[:, : self.tokenizer.model_max_length] - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): - removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) - logger.warning( - "The following part of your input was truncated because CLIP can only handle 
sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" + text_encoder_output = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask.to(device) ) - text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] - attention_mask = attention_mask[:, : self.tokenizer.model_max_length] + prompt_embeds = text_encoder_output.last_hidden_state - text_encoder_output = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask.to(device)) - text_encoder_hidden_states = text_encoder_output.last_hidden_state - text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) + prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0) - uncond_text_encoder_hidden_states = None - if do_classifier_free_guidance: + if negative_prompt_embeds is None and do_classifier_free_guidance: uncond_tokens: List[str] if negative_prompt is None: uncond_tokens = [""] * batch_size @@ -215,17 +231,17 @@ def encode_prompt( uncond_input.input_ids.to(device), attention_mask=uncond_input.attention_mask.to(device) ) - uncond_text_encoder_hidden_states = negative_prompt_embeds_text_encoder_output.last_hidden_state + negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.last_hidden_state + if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = uncond_text_encoder_hidden_states.shape[1] - uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.repeat(1, num_images_per_prompt, 1) - uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.view( - batch_size * num_images_per_prompt, seq_len, -1 - ) + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) # done duplicates - return text_encoder_hidden_states, uncond_text_encoder_hidden_states + return prompt_embeds, negative_prompt_embeds def check_inputs( self, @@ -264,13 +280,15 @@ def check_inputs( @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, - prompt: Union[str, List[str]] = None, + prompt: Optional[Union[str, List[str]]] = None, height: int = 1024, width: int = 1024, num_inference_steps: int = 60, timesteps: List[float] = None, guidance_scale: float = 8.0, negative_prompt: Optional[Union[str, List[str]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, num_images_per_prompt: Optional[int] = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, latents: Optional[torch.FloatTensor] = None, @@ -304,6 +322,13 @@ def __call__( negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored if `decoder_guidance_scale` is less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. 
Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. generator (`torch.Generator` or `List[torch.Generator]`, *optional*): @@ -345,7 +370,13 @@ def __call__( # 2. Encode caption prompt_embeds, negative_prompt_embeds = self.encode_prompt( - prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, ) # For classifier free guidance, we need to do two forward passes. From b1105269b73b4851a203d6679238ff39918cdbd2 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 12 Sep 2023 14:55:27 +0000 Subject: [PATCH 21/37] make style --- .../wuerstchen/pipeline_wuerstchen_combined.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py index 155dc88345eb..0695e6379668 100644 --- a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +++ b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py @@ -172,12 +172,12 @@ def __call__( The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. + Pre-generated text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, text embeddings will be generated from `prompt` input argument. negative_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. + Pre-generated negative text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.* + prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` + input argument. num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. 
height (`int`, *optional*, defaults to 512): From 4d897aaff5e6e635c285a26d72eb54671b162844 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Tue, 12 Sep 2023 21:15:47 +0530 Subject: [PATCH 22/37] fix image variation slow test (#4995) fix image variation tests Co-authored-by: Patrick von Platen --- .../test_stable_diffusion_image_variation.py | 40 ++++++++++++------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py index b6d6c7b80c98..cd688c3beb37 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py @@ -35,6 +35,8 @@ load_image, load_numpy, nightly, + numpy_cosine_similarity_distance, + print_tensor_test, require_torch_gpu, slow, torch_device, @@ -182,7 +184,7 @@ def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0 "generator": generator, "num_inference_steps": 3, "guidance_scale": 7.5, - "output_type": "numpy", + "output_type": "np", } return inputs @@ -193,13 +195,17 @@ def test_stable_diffusion_img_variation_pipeline_default(self): sd_pipe = sd_pipe.to(torch_device) sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs(torch_device) + generator_device = "cpu" + inputs = self.get_inputs(generator_device) image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.84491, 0.90789, 0.75708, 0.78734, 0.83485, 0.70099, 0.66938, 0.68727, 0.61379]) - assert np.abs(image_slice - expected_slice).max() < 6e-3 + expected_slice = np.array([0.8449, 0.9079, 0.7571, 0.7873, 0.8348, 0.7010, 0.6694, 0.6873, 0.6138]) + print_tensor_test(image_slice) + + max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice) + assert max_diff < 1e-4 def test_stable_diffusion_img_variation_intermediate_state(self): number_of_steps = 0 @@ -212,31 +218,36 @@ def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 64, 64) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array( - [-0.1621, 0.2837, -0.7979, -0.1221, -1.3057, 0.7681, -2.1191, 0.0464, 1.6309] - ) + expected_slice = np.array([-0.7974, -0.4343, -1.087, 0.04785, -1.327, 0.855, -2.148, -0.1725, 1.439]) + max_diff = numpy_cosine_similarity_distance(latents_slice.flatten(), expected_slice) + + assert max_diff < 1e-3 - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 elif step == 2: latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 64, 64) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([0.6299, 1.7500, 1.1992, -2.1582, -1.8994, 0.7334, -0.7090, 1.0137, 1.5273]) + expected_slice = np.array([0.3232, 0.004883, 0.913, -1.084, 0.6143, -1.6875, -2.463, -0.439, -0.419]) + max_diff = numpy_cosine_similarity_distance(latents_slice.flatten(), expected_slice) - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 + assert max_diff < 1e-3 callback_fn.has_been_called = False pipe = StableDiffusionImageVariationPipeline.from_pretrained( - "fusing/sd-image-variations-diffusers", + "lambdalabs/sd-image-variations-diffusers", safety_checker=None, torch_dtype=torch.float16, ) + pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() + for 
component in pipe.components.values(): + if hasattr(component, "set_default_attn_processor"): + component.set_default_attn_processor() - inputs = self.get_inputs(torch_device, dtype=torch.float16) + generator_device = "cpu" + inputs = self.get_inputs(generator_device, dtype=torch.float16) pipe(**inputs, callback=callback_fn, callback_steps=1) assert callback_fn.has_been_called assert number_of_steps == inputs["num_inference_steps"] @@ -246,9 +257,8 @@ def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): torch.cuda.reset_max_memory_allocated() torch.cuda.reset_peak_memory_stats() - model_id = "fusing/sd-image-variations-diffusers" pipe = StableDiffusionImageVariationPipeline.from_pretrained( - model_id, safety_checker=None, torch_dtype=torch.float16 + "lambdalabs/sd-image-variations-diffusers", safety_checker=None, torch_dtype=torch.float16 ) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) From f64d52dbca93051a7652db7aa241964235a71035 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Tue, 12 Sep 2023 21:20:47 +0530 Subject: [PATCH 23/37] fix custom diffusion tests (#4996) --- tests/models/test_models_unet_2d_condition.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_models_unet_2d_condition.py b/tests/models/test_models_unet_2d_condition.py index f0f91a3a86a1..8aa2099154a1 100644 --- a/tests/models/test_models_unet_2d_condition.py +++ b/tests/models/test_models_unet_2d_condition.py @@ -785,8 +785,8 @@ def test_custom_diffusion_save_load(self): self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_custom_diffusion_weights.bin"))) torch.manual_seed(0) new_model = self.model_class(**init_dict) - new_model.to(torch_device) new_model.load_attn_procs(tmpdirname, weight_name="pytorch_custom_diffusion_weights.bin") + new_model.to(torch_device) with torch.no_grad(): new_sample = new_model(**inputs_dict).sample From 37cb819df5309af1f8963d11bd42ba20b5cd8b2a Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 12 Sep 2023 17:51:15 +0200 Subject: [PATCH 24/37] [Lora] Speed up lora loading (#4994) * speed up lora loading * Apply suggestions from code review * up * up * Fix more * Correct more * Apply suggestions from code review * up * Fix more * Fix more - * up * up --- src/diffusers/loaders.py | 203 ++++++++++++++----------- src/diffusers/models/modeling_utils.py | 54 ++++--- 2 files changed, 148 insertions(+), 109 deletions(-) diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index 52c140a6782a..16eabb0077bb 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
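
Before the diff body below, a sketch of what this patch enables at the call site: the new `low_cpu_mem_usage` flag is threaded from `load_lora_weights` down into the UNet and text-encoder loaders. A hedged usage example, where the LoRA repo id is a placeholder and not a real checkpoint:

```python
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
)
# low_cpu_mem_usage defaults to True on torch >= 1.9.0; it is passed explicitly
# here only to show where the new kwarg surfaces. "some-user/some-lora" is a
# hypothetical checkpoint id.
pipe.load_lora_weights("some-user/some-lora", low_cpu_mem_usage=True)
```
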
-import copy import os import re import warnings @@ -27,6 +26,7 @@ from huggingface_hub import hf_hub_download, model_info from torch import nn +from .models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, load_model_dict_into_meta from .utils import ( DIFFUSERS_CACHE, HF_HUB_OFFLINE, @@ -46,7 +46,6 @@ if is_accelerate_available(): from accelerate import init_empty_weights from accelerate.hooks import AlignDevicesHook, CpuOffload, remove_hook_from_module - from accelerate.utils import set_module_tensor_to_device logger = logging.get_logger(__name__) @@ -137,7 +136,6 @@ def _unfuse_lora(self): self.w_down = None def forward(self, input): - # print(f"{self.__class__.__name__} has a lora_scale of {self.lora_scale}") if self.lora_scale is None: self.lora_scale = 1.0 if self.lora_linear_layer is None: @@ -274,6 +272,11 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict use_auth_token (`str` or *bool*, *optional*): The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from `diffusers-cli login` (stored in `~/.huggingface`) is used. + low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`): + Speed up model loading only loading the pretrained weights and not initializing the weights. This also + tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model. + Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this + argument to `True` will raise an error. revision (`str`, *optional*, defaults to `"main"`): The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier allowed by Git. @@ -300,6 +303,7 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict subfolder = kwargs.pop("subfolder", None) weight_name = kwargs.pop("weight_name", None) use_safetensors = kwargs.pop("use_safetensors", None) + low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT) # This value has the same meaning as the `--network_alpha` option in the kohya-ss trainer script. # See https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning network_alphas = kwargs.pop("network_alphas", None) @@ -316,6 +320,15 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict "framework": "pytorch", } + if low_cpu_mem_usage and not is_accelerate_available(): + low_cpu_mem_usage = False + logger.warning( + "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the" + " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install" + " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip" + " install accelerate\n```\n." 
+ ) + model_file = None if not isinstance(pretrained_model_name_or_path_or_dict, dict): # Let's first try to load .safetensors weights @@ -370,6 +383,10 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict # correct keys state_dict, network_alphas = self.convert_state_dict_legacy_attn_format(state_dict, network_alphas) + if network_alphas is not None: + network_alphas_keys = list(network_alphas.keys()) + used_network_alphas_keys = set() + lora_grouped_dict = defaultdict(dict) mapped_network_alphas = {} @@ -381,13 +398,13 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict # Create another `mapped_network_alphas` dictionary so that we can properly map them. if network_alphas is not None: - network_alphas_ = copy.deepcopy(network_alphas) - for k in network_alphas_: + for k in network_alphas_keys: if k.replace(".alpha", "") in key: - mapped_network_alphas.update({attn_processor_key: network_alphas.pop(k)}) + mapped_network_alphas.update({attn_processor_key: network_alphas.get(k)}) + used_network_alphas_keys.add(k) if not is_network_alphas_none: - if len(network_alphas) > 0: + if len(set(network_alphas_keys) - used_network_alphas_keys) > 0: raise ValueError( f"The `network_alphas` has to be empty at this point but has the following keys \n\n {', '.join(network_alphas.keys())}" ) @@ -411,29 +428,38 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict out_features = attn_processor.out_channels kernel_size = attn_processor.kernel_size - lora = LoRAConv2dLayer( - in_features=in_features, - out_features=out_features, - rank=rank, - kernel_size=kernel_size, - stride=attn_processor.stride, - padding=attn_processor.padding, - network_alpha=mapped_network_alphas.get(key), - ) + ctx = init_empty_weights if low_cpu_mem_usage else nullcontext + with ctx(): + lora = LoRAConv2dLayer( + in_features=in_features, + out_features=out_features, + rank=rank, + kernel_size=kernel_size, + stride=attn_processor.stride, + padding=attn_processor.padding, + network_alpha=mapped_network_alphas.get(key), + ) elif isinstance(attn_processor, LoRACompatibleLinear): - lora = LoRALinearLayer( - attn_processor.in_features, - attn_processor.out_features, - rank, - mapped_network_alphas.get(key), - ) + ctx = init_empty_weights if low_cpu_mem_usage else nullcontext + with ctx(): + lora = LoRALinearLayer( + attn_processor.in_features, + attn_processor.out_features, + rank, + mapped_network_alphas.get(key), + ) else: raise ValueError(f"Module {key} is not a LoRACompatibleConv or LoRACompatibleLinear module.") value_dict = {k.replace("lora.", ""): v for k, v in value_dict.items()} - lora.load_state_dict(value_dict) lora_layers_list.append((attn_processor, lora)) + if low_cpu_mem_usage: + device = next(iter(value_dict.values())).device + dtype = next(iter(value_dict.values())).dtype + load_model_dict_into_meta(lora, value_dict, device=device, dtype=dtype) + else: + lora.load_state_dict(value_dict) elif is_custom_diffusion: attn_processors = {} custom_diffusion_grouped_dict = defaultdict(dict) @@ -470,13 +496,12 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict f"{model_file} does not seem to be in the correct format expected by LoRA or Custom Diffusion training." 
) - # set correct dtype & device - lora_layers_list = [(t, l.to(device=self.device, dtype=self.dtype)) for t, l in lora_layers_list] - # set lora layers for target_module, lora_layer in lora_layers_list: target_module.set_lora_layer(lora_layer) + self.to(dtype=self.dtype, device=self.device) + def convert_state_dict_legacy_attn_format(self, state_dict, network_alphas): is_new_lora_format = all( key.startswith(self.unet_name) or key.startswith(self.text_encoder_name) for key in state_dict.keys() @@ -999,13 +1024,18 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di recurive = is_sequential_cpu_offload remove_hook_from_module(component, recurse=recurive) + low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT) + state_dict, network_alphas = self.lora_state_dict(pretrained_model_name_or_path_or_dict, **kwargs) - self.load_lora_into_unet(state_dict, network_alphas=network_alphas, unet=self.unet) + self.load_lora_into_unet( + state_dict, network_alphas=network_alphas, unet=self.unet, low_cpu_mem_usage=low_cpu_mem_usage + ) self.load_lora_into_text_encoder( state_dict, network_alphas=network_alphas, text_encoder=self.text_encoder, lora_scale=self.lora_scale, + low_cpu_mem_usage=low_cpu_mem_usage, ) # Offload back. @@ -1065,6 +1095,11 @@ def lora_state_dict( allowed by Git. subfolder (`str`, *optional*, defaults to `""`): The subfolder location of a model file within a larger model repository on the Hub or locally. + low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`): + Speed up model loading only loading the pretrained weights and not initializing the weights. This also + tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model. + Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this + argument to `True` will raise an error. mirror (`str`, *optional*): Mirror source to resolve accessibility issues if you're downloading a model in China. We do not guarantee the timeliness or safety of the source, and you should refer to the mirror site for more @@ -1305,7 +1340,7 @@ def _maybe_map_sgm_blocks_to_diffusers(cls, state_dict, unet_config, delimiter=" return new_state_dict @classmethod - def load_lora_into_unet(cls, state_dict, network_alphas, unet): + def load_lora_into_unet(cls, state_dict, network_alphas, unet, low_cpu_mem_usage=None): """ This will load the LoRA layers specified in `state_dict` into `unet`. @@ -1318,7 +1353,13 @@ def load_lora_into_unet(cls, state_dict, network_alphas, unet): See `LoRALinearLayer` for more details. unet (`UNet2DConditionModel`): The UNet model to load the LoRA layers into. + low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`): + Speed up model loading only loading the pretrained weights and not initializing the weights. This also + tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model. + Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this + argument to `True` will raise an error. """ + low_cpu_mem_usage = low_cpu_mem_usage if low_cpu_mem_usage is not None else _LOW_CPU_MEM_USAGE_DEFAULT # If the serialization format is new (introduced in https://github.com/huggingface/diffusers/pull/2918), # then the `state_dict` keys should have `self.unet_name` and/or `self.text_encoder_name` as # their prefixes. 
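
The mechanism behind `low_cpu_mem_usage`, reduced to its essence: construct modules on PyTorch's `meta` device (no allocation, no random init), then materialize each parameter directly from the checkpoint tensor. A rough stand-alone sketch using `accelerate`, with a toy linear layer standing in for the real LoRA classes:

```python
import torch
from accelerate import init_empty_weights
from accelerate.utils import set_module_tensor_to_device

# Construction under init_empty_weights allocates nothing and runs no init.
with init_empty_weights():
    lora_down = torch.nn.Linear(768, 4, bias=False)  # toy stand-in for a LoRA down-projection

# Pretend this tensor came straight from a checkpoint.
state_dict = {"weight": torch.randn(4, 768, dtype=torch.float16)}

# Materialize each parameter in place from the checkpoint value, avoiding the
# usual allocate-randomly-then-overwrite double cost.
for name, tensor in state_dict.items():
    set_module_tensor_to_device(lora_down, name, "cpu", value=tensor)
```
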
@@ -1343,11 +1384,12 @@ def load_lora_into_unet(cls, state_dict, network_alphas, unet):
             warn_message = "You have saved the LoRA weights using the old format. To convert the old LoRA weights to the new format, you can first load them in a dictionary and then create a new dictionary like the following: `new_state_dict = {f'unet.{module_name}': params for module_name, params in old_state_dict.items()}`."
             warnings.warn(warn_message)
 
-        # load loras into unet
-        unet.load_attn_procs(state_dict, network_alphas=network_alphas)
+        unet.load_attn_procs(state_dict, network_alphas=network_alphas, low_cpu_mem_usage=low_cpu_mem_usage)
 
     @classmethod
-    def load_lora_into_text_encoder(cls, state_dict, network_alphas, text_encoder, prefix=None, lora_scale=1.0):
+    def load_lora_into_text_encoder(
+        cls, state_dict, network_alphas, text_encoder, prefix=None, lora_scale=1.0, low_cpu_mem_usage=None
+    ):
         """
         This will load the LoRA layers specified in `state_dict` into `text_encoder`.
 
@@ -1364,7 +1406,13 @@ def load_lora_into_text_encoder(cls, state_dict, network_alphas, text_encoder, p
             lora_scale (`float`):
                 How much to scale the output of the lora linear layer before it is added with the output of the
                 regular lora layer.
+            low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`):
+                Speed up model loading only loading the pretrained weights and not initializing the weights. This also
+                tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model.
+                Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this
+                argument to `True` will raise an error.
         """
+        low_cpu_mem_usage = low_cpu_mem_usage if low_cpu_mem_usage is not None else _LOW_CPU_MEM_USAGE_DEFAULT
 
         # If the serialization format is new (introduced in https://github.com/huggingface/diffusers/pull/2918),
         # then the `state_dict` keys should have `self.unet_name` and/or `self.text_encoder_name` as
@@ -1447,6 +1495,7 @@ def load_lora_into_text_encoder(cls, state_dict, network_alphas, text_encoder, p
                     network_alphas,
                     rank=rank,
                     patch_mlp=patch_mlp,
+                    low_cpu_mem_usage=low_cpu_mem_usage,
                 )
 
                 # set correct dtype & device
@@ -1454,12 +1503,23 @@ def load_lora_into_text_encoder(cls, state_dict, network_alphas, text_encoder, p
                     k: v.to(device=text_encoder.device, dtype=text_encoder.dtype)
                     for k, v in text_encoder_lora_state_dict.items()
                 }
-                load_state_dict_results = text_encoder.load_state_dict(text_encoder_lora_state_dict, strict=False)
-                if len(load_state_dict_results.unexpected_keys) != 0:
+                if low_cpu_mem_usage:
+                    device = next(iter(text_encoder_lora_state_dict.values())).device
+                    dtype = next(iter(text_encoder_lora_state_dict.values())).dtype
+                    unexpected_keys = load_model_dict_into_meta(
+                        text_encoder, text_encoder_lora_state_dict, device=device, dtype=dtype
+                    )
+                else:
+                    load_state_dict_results = text_encoder.load_state_dict(text_encoder_lora_state_dict, strict=False)
+                    unexpected_keys = load_state_dict_results.unexpected_keys
+
+                if len(unexpected_keys) != 0:
                     raise ValueError(
                         f"failed to load text encoder state dict, unexpected keys: {unexpected_keys}"
                     )
 
+                text_encoder.to(device=text_encoder.device, dtype=text_encoder.dtype)
+
     @property
     def lora_scale(self) -> float:
         # property function that returns the lora scale which can be set at run time by the pipeline.
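
The `_modify_text_encoder` refactor in the next hunk deduplicates one idea: each attention (and optionally MLP) `nn.Linear` in the text encoder is swapped in place for a wrapper that adds a scaled low-rank update to the base projection. A toy sketch of that wrapping, where `ToyLoraProjection` is illustrative and not the real `PatchedLoraProjection`:

```python
import torch.nn as nn

class ToyLoraProjection(nn.Module):
    """Illustrative stand-in for PatchedLoraProjection: base layer + scaled low-rank delta."""

    def __init__(self, linear: nn.Linear, rank: int = 4, lora_scale: float = 1.0):
        super().__init__()
        self.regular_linear_layer = linear
        self.lora_scale = lora_scale
        # Down/up pair; `up` starts at zero so the wrapper is initially a no-op.
        self.down = nn.Linear(linear.in_features, rank, bias=False)
        self.up = nn.Linear(rank, linear.out_features, bias=False)
        nn.init.zeros_(self.up.weight)

    def forward(self, x):
        return self.regular_linear_layer(x) + self.lora_scale * self.up(self.down(x))

# "Monkey-patching" is then a plain attribute swap on the live module, e.g.:
#     attn_module.q_proj = ToyLoraProjection(attn_module.q_proj, rank=4)
```
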
@@ -1492,11 +1552,21 @@ def _modify_text_encoder( rank: Union[Dict[str, int], int] = 4, dtype=None, patch_mlp=False, + low_cpu_mem_usage=False, ): r""" Monkey-patches the forward passes of attention modules of the text encoder. """ + def create_patched_linear_lora(model, network_alpha, rank, dtype, lora_parameters): + linear_layer = model.regular_linear_layer if isinstance(model, PatchedLoraProjection) else model + ctx = init_empty_weights if low_cpu_mem_usage else nullcontext + with ctx(): + model = PatchedLoraProjection(linear_layer, lora_scale, network_alpha, rank, dtype=dtype) + + lora_parameters.extend(model.lora_linear_layer.parameters()) + return model + # First, remove any monkey-patch that might have been applied before cls._remove_text_encoder_monkey_patch_classmethod(text_encoder) @@ -1515,45 +1585,18 @@ def _modify_text_encoder( else: current_rank = rank - q_linear_layer = ( - attn_module.q_proj.regular_linear_layer - if isinstance(attn_module.q_proj, PatchedLoraProjection) - else attn_module.q_proj + attn_module.q_proj = create_patched_linear_lora( + attn_module.q_proj, query_alpha, current_rank, dtype, lora_parameters ) - attn_module.q_proj = PatchedLoraProjection( - q_linear_layer, lora_scale, network_alpha=query_alpha, rank=current_rank, dtype=dtype + attn_module.k_proj = create_patched_linear_lora( + attn_module.k_proj, key_alpha, current_rank, dtype, lora_parameters ) - lora_parameters.extend(attn_module.q_proj.lora_linear_layer.parameters()) - - k_linear_layer = ( - attn_module.k_proj.regular_linear_layer - if isinstance(attn_module.k_proj, PatchedLoraProjection) - else attn_module.k_proj + attn_module.v_proj = create_patched_linear_lora( + attn_module.v_proj, value_alpha, current_rank, dtype, lora_parameters ) - attn_module.k_proj = PatchedLoraProjection( - k_linear_layer, lora_scale, network_alpha=key_alpha, rank=current_rank, dtype=dtype + attn_module.out_proj = create_patched_linear_lora( + attn_module.out_proj, out_alpha, current_rank, dtype, lora_parameters ) - lora_parameters.extend(attn_module.k_proj.lora_linear_layer.parameters()) - - v_linear_layer = ( - attn_module.v_proj.regular_linear_layer - if isinstance(attn_module.v_proj, PatchedLoraProjection) - else attn_module.v_proj - ) - attn_module.v_proj = PatchedLoraProjection( - v_linear_layer, lora_scale, network_alpha=value_alpha, rank=current_rank, dtype=dtype - ) - lora_parameters.extend(attn_module.v_proj.lora_linear_layer.parameters()) - - out_linear_layer = ( - attn_module.out_proj.regular_linear_layer - if isinstance(attn_module.out_proj, PatchedLoraProjection) - else attn_module.out_proj - ) - attn_module.out_proj = PatchedLoraProjection( - out_linear_layer, lora_scale, network_alpha=out_alpha, rank=current_rank, dtype=dtype - ) - lora_parameters.extend(attn_module.out_proj.lora_linear_layer.parameters()) if patch_mlp: for name, mlp_module in text_encoder_mlp_modules(text_encoder): @@ -1563,25 +1606,12 @@ def _modify_text_encoder( current_rank_fc1 = rank.pop(f"{name}.fc1.lora_linear_layer.up.weight") current_rank_fc2 = rank.pop(f"{name}.fc2.lora_linear_layer.up.weight") - fc1_linear_layer = ( - mlp_module.fc1.regular_linear_layer - if isinstance(mlp_module.fc1, PatchedLoraProjection) - else mlp_module.fc1 - ) - mlp_module.fc1 = PatchedLoraProjection( - fc1_linear_layer, lora_scale, network_alpha=fc1_alpha, rank=current_rank_fc1, dtype=dtype - ) - lora_parameters.extend(mlp_module.fc1.lora_linear_layer.parameters()) - - fc2_linear_layer = ( - mlp_module.fc2.regular_linear_layer - if 
isinstance(mlp_module.fc2, PatchedLoraProjection) - else mlp_module.fc2 + mlp_module.fc1 = create_patched_linear_lora( + mlp_module.fc1, fc1_alpha, current_rank_fc1, dtype, lora_parameters ) - mlp_module.fc2 = PatchedLoraProjection( - fc2_linear_layer, lora_scale, network_alpha=fc2_alpha, rank=current_rank_fc2, dtype=dtype + mlp_module.fc2 = create_patched_linear_lora( + mlp_module.fc2, fc2_alpha, current_rank_fc2, dtype, lora_parameters ) - lora_parameters.extend(mlp_module.fc2.lora_linear_layer.parameters()) if is_network_alphas_populated and len(network_alphas) > 0: raise ValueError( @@ -2375,8 +2405,7 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs): vae = AutoencoderKL(**vae_config) if is_accelerate_available(): - for param_name, param in converted_vae_checkpoint.items(): - set_module_tensor_to_device(vae, param_name, "cpu", value=param) + load_model_dict_into_meta(vae, converted_vae_checkpoint, device="cpu") else: vae.load_state_dict(converted_vae_checkpoint) diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py index e53fa7e528b7..67746ebacef2 100644 --- a/src/diffusers/models/modeling_utils.py +++ b/src/diffusers/models/modeling_utils.py @@ -128,6 +128,31 @@ def load_state_dict(checkpoint_file: Union[str, os.PathLike], variant: Optional[ ) +def load_model_dict_into_meta(model, state_dict, device=None, dtype=None, model_name_or_path=None): + device = device or torch.device("cpu") + dtype = dtype or torch.float32 + + unexpected_keys = [] + empty_state_dict = model.state_dict() + for param_name, param in state_dict.items(): + if param_name not in empty_state_dict: + unexpected_keys.append(param_name) + continue + + if empty_state_dict[param_name].shape != param.shape: + model_name_or_path_str = f"{model_name_or_path} " if model_name_or_path is not None else "" + raise ValueError( + f"Cannot load {model_name_or_path_str}because {param_name} expected shape {empty_state_dict[param_name]}, but got {param.shape}. If you want to instead overwrite randomly initialized weights, please make sure to pass both `low_cpu_mem_usage=False` and `ignore_mismatched_sizes=True`. For more information, see also: https://github.com/huggingface/diffusers/issues/1619#issuecomment-1345604389 as an example." + ) + + accepts_dtype = "dtype" in set(inspect.signature(set_module_tensor_to_device).parameters.keys()) + if accepts_dtype: + set_module_tensor_to_device(model, param_name, device, value=param, dtype=dtype) + else: + set_module_tensor_to_device(model, param_name, device, value=param) + return unexpected_keys + + def _load_state_dict_into_model(model_to_load, state_dict): # Convert old format to new format if needed from a PyTorch state_dict # copy state_dict so _load_from_state_dict can modify it @@ -624,29 +649,14 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P " `low_cpu_mem_usage=False` and `device_map=None` if you want to randomly initialize" " those weights or else make sure your checkpoint file is correct." 
) - unexpected_keys = [] - - empty_state_dict = model.state_dict() - for param_name, param in state_dict.items(): - accepts_dtype = "dtype" in set( - inspect.signature(set_module_tensor_to_device).parameters.keys() - ) - - if param_name not in empty_state_dict: - unexpected_keys.append(param_name) - continue - - if empty_state_dict[param_name].shape != param.shape: - raise ValueError( - f"Cannot load {pretrained_model_name_or_path} because {param_name} expected shape {empty_state_dict[param_name]}, but got {param.shape}. If you want to instead overwrite randomly initialized weights, please make sure to pass both `low_cpu_mem_usage=False` and `ignore_mismatched_sizes=True`. For more information, see also: https://github.com/huggingface/diffusers/issues/1619#issuecomment-1345604389 as an example." - ) - if accepts_dtype: - set_module_tensor_to_device( - model, param_name, param_device, value=param, dtype=torch_dtype - ) - else: - set_module_tensor_to_device(model, param_name, param_device, value=param) + unexpected_keys = load_model_dict_into_meta( + model, + state_dict, + device=param_device, + dtype=torch_dtype, + model_name_or_path=pretrained_model_name_or_path, + ) if cls._keys_to_ignore_on_load_unexpected is not None: for pat in cls._keys_to_ignore_on_load_unexpected: From 1f948109b80b8181b06aa6fc39f27a908b27322d Mon Sep 17 00:00:00 2001 From: dg845 <58458699+dg845@users.noreply.github.com> Date: Tue, 12 Sep 2023 08:58:47 -0700 Subject: [PATCH 25/37] [docs] Fix DiffusionPipeline.enable_sequential_cpu_offload docstring (#4952) * Fix an unmatched backtick and make description more general for DiffusionPipeline.enable_sequential_cpu_offload. * make style * _exclude_from_cpu_offload -> self._exclude_from_cpu_offload * make style * apply suggestions from review * make style --- src/diffusers/pipelines/pipeline_utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 71d32085afa4..5ead6ef810d9 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -1293,10 +1293,10 @@ def maybe_free_model_hooks(self): def enable_sequential_cpu_offload(self, gpu_id: int = 0, device: Union[torch.device, str] = "cuda"): r""" - Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, - text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a - `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. - Note that offloading happens on a submodule basis. Memory savings are higher than with + Offloads all models to CPU using 🤗 Accelerate, significantly reducing memory usage. When called, the state + dicts of all `torch.nn.Module` components (except those in `self._exclude_from_cpu_offload`) are saved to CPU + and then moved to `torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward` + method called. Offloading happens on a submodule basis. Memory savings are higher than with `enable_model_cpu_offload`, but performance is lower. 
""" if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"): From 0e0db625d0b7da2e6c27336845514b4460bd0000 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 12 Sep 2023 18:56:35 +0200 Subject: [PATCH 26/37] Fix safety checker seq offload (#4998) * fix safety checker * fix safety checker * fix safety checker --- src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py | 1 + .../pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py | 1 + src/diffusers/pipelines/controlnet/pipeline_controlnet.py | 1 + .../pipelines/controlnet/pipeline_controlnet_img2img.py | 1 + .../pipelines/controlnet/pipeline_controlnet_inpaint.py | 1 + .../pipelines/stable_diffusion/pipeline_stable_diffusion.py | 1 + .../pipeline_stable_diffusion_attend_and_excite.py | 1 + .../stable_diffusion/pipeline_stable_diffusion_diffedit.py | 1 + .../stable_diffusion/pipeline_stable_diffusion_gligen.py | 1 + .../pipeline_stable_diffusion_gligen_text_image.py | 1 + .../pipeline_stable_diffusion_image_variation.py | 1 + .../stable_diffusion/pipeline_stable_diffusion_img2img.py | 1 + .../stable_diffusion/pipeline_stable_diffusion_inpaint.py | 1 + .../stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py | 1 + .../pipeline_stable_diffusion_instruct_pix2pix.py | 1 + .../stable_diffusion/pipeline_stable_diffusion_k_diffusion.py | 1 + .../stable_diffusion/pipeline_stable_diffusion_ldm3d.py | 1 + .../stable_diffusion/pipeline_stable_diffusion_model_editing.py | 1 + .../stable_diffusion/pipeline_stable_diffusion_panorama.py | 1 + .../stable_diffusion/pipeline_stable_diffusion_paradigms.py | 1 + .../stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py | 1 + .../pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py | 1 + .../stable_diffusion/pipeline_stable_diffusion_upscale.py | 1 + 23 files changed, 23 insertions(+) diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py index 9b56af541d8a..7b4ba394bb74 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py @@ -100,6 +100,7 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL """ model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] + _exclude_from_cpu_offload = ["safety_checker"] def __init__( self, diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index 0f01844395cf..00a5cb452a90 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -127,6 +127,7 @@ class AltDiffusionImg2ImgPipeline( """ model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] + _exclude_from_cpu_offload = ["safety_checker"] def __init__( self, diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py index 5c4e8fb0b555..c498f606d3d7 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py @@ -125,6 +125,7 @@ class StableDiffusionControlNetPipeline( """ model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] + 
_exclude_from_cpu_offload = ["safety_checker"] def __init__( self, diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py index abfa7225d15e..1126fa8b139e 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py @@ -149,6 +149,7 @@ class StableDiffusionControlNetImg2ImgPipeline( """ model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] + _exclude_from_cpu_offload = ["safety_checker"] def __init__( self, diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index 2f046c137e3f..be3bfca96b32 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -273,6 +273,7 @@ class StableDiffusionControlNetInpaintPipeline( """ model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] + _exclude_from_cpu_offload = ["safety_checker"] def __init__( self, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 2369a02a10f1..b637754c55a4 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -101,6 +101,7 @@ class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo """ model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] + _exclude_from_cpu_offload = ["safety_checker"] def __init__( self, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py index e62d80a25c53..8bfcc7decb34 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py @@ -191,6 +191,7 @@ class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, TextualInversion """ model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] + _exclude_from_cpu_offload = ["safety_checker"] def __init__( self, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py index f641ea54e2d6..40d53d384bfd 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py @@ -272,6 +272,7 @@ class StableDiffusionDiffEditPipeline(DiffusionPipeline, TextualInversionLoaderM """ model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor", "inverse_scheduler"] + _exclude_from_cpu_offload = ["safety_checker"] def __init__( self, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_gligen.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_gligen.py index 57ca7ccbcc18..f9b74fbf2eb7 100644 --- 
a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_gligen.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_gligen.py @@ -124,6 +124,7 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline): """ _optional_components = ["safety_checker", "feature_extractor"] model_cpu_offload_seq = "text_encoder->unet->vae" + _exclude_from_cpu_offload = ["safety_checker"] def __init__( self, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_gligen_text_image.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_gligen_text_image.py index e75c88f7e18f..a7cdb168e384 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_gligen_text_image.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_gligen_text_image.py @@ -182,6 +182,7 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline): """ model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] + _exclude_from_cpu_offload = ["safety_checker"] def __init__( self, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py index 6caa87e369d6..b89f0bd9908c 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py @@ -66,6 +66,7 @@ class StableDiffusionImageVariationPipeline(DiffusionPipeline): # we should give a descriptive message if the pipeline doesn't have one. _optional_components = ["safety_checker"] model_cpu_offload_seq = "image_encoder->unet->vae" + _exclude_from_cpu_offload = ["safety_checker"] def __init__( self, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 3f447ccad95d..6d01f2285af2 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -129,6 +129,7 @@ class StableDiffusionImg2ImgPipeline( """ model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] + _exclude_from_cpu_offload = ["safety_checker"] def __init__( self, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index be52e27a0ec0..82d770e1d2e7 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -194,6 +194,7 @@ class StableDiffusionInpaintPipeline( """ model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] + _exclude_from_cpu_offload = ["safety_checker"] def __init__( self, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py index 0fe05966f260..c25bf1c7be33 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py @@ -117,6 +117,7 @@ class 
StableDiffusionInpaintPipelineLegacy( """ model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["feature_extractor"] + _exclude_from_cpu_offload = ["safety_checker"] def __init__( self, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py index 4c8a7c6cc176..2628a165ebed 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py @@ -91,6 +91,7 @@ class StableDiffusionInstructPix2PixPipeline(DiffusionPipeline, TextualInversion """ model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] + _exclude_from_cpu_offload = ["safety_checker"] def __init__( self, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py index 6aca8a4feb8c..dc00f9fd4378 100755 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py @@ -82,6 +82,7 @@ class StableDiffusionKDiffusionPipeline(DiffusionPipeline, TextualInversionLoade """ model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] + _exclude_from_cpu_offload = ["safety_checker"] def __init__( self, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py index ad1c39607672..217b2bb43032 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py @@ -114,6 +114,7 @@ class StableDiffusionLDM3DPipeline( """ model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] + _exclude_from_cpu_offload = ["safety_checker"] def __init__( self, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py index 5ef8ba48d4f5..f8d6296ea943 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py @@ -68,6 +68,7 @@ class StableDiffusionModelEditingPipeline(DiffusionPipeline, TextualInversionLoa """ model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] + _exclude_from_cpu_offload = ["safety_checker"] def __init__( self, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py index 811cf69cabf5..aec93b56b6f8 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py @@ -80,6 +80,7 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderM """ model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] + _exclude_from_cpu_offload = ["safety_checker"] def __init__( self, 
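From the user's side, the fix in this patch means sequential offloading now works on pipelines with a safety checker: components listed in `_exclude_from_cpu_offload` are skipped when the per-submodule offload hooks are installed. A hedged usage sketch (the model id is illustrative):

```python
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
)
# With `_exclude_from_cpu_offload = ["safety_checker"]`, the safety checker is
# left untouched while the text encoder, UNet, and VAE get offload hooks.
pipe.enable_sequential_cpu_offload()
image = pipe("an astronaut riding a horse", num_inference_steps=25).images[0]
```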
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_paradigms.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_paradigms.py index 735951723dda..5152209f21aa 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_paradigms.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_paradigms.py @@ -95,6 +95,7 @@ class StableDiffusionParadigmsPipeline( """ model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] + _exclude_from_cpu_offload = ["safety_checker"] def __init__( self, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py index 5a55108575b4..47a16d6663cf 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py @@ -315,6 +315,7 @@ class StableDiffusionPix2PixZeroPipeline(DiffusionPipeline): "caption_processor", "inverse_scheduler", ] + _exclude_from_cpu_offload = ["safety_checker"] def __init__( self, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py index 204b5bd10986..84094f69b78c 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py @@ -119,6 +119,7 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, TextualInversionLoaderMixin) """ model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] + _exclude_from_cpu_offload = ["safety_checker"] def __init__( self, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py index 855d178ae541..31982891cd01 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py @@ -92,6 +92,7 @@ class StableDiffusionUpscalePipeline(DiffusionPipeline, TextualInversionLoaderMi """ model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["watermarker", "safety_checker", "feature_extractor"] + _exclude_from_cpu_offload = ["safety_checker"] def __init__( self, From 6ea95b7a90c12c5b27773b440cf625243582203f Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Tue, 12 Sep 2023 10:36:38 -0700 Subject: [PATCH 27/37] Fix PR template (#4984) fix template --- .github/PULL_REQUEST_TEMPLATE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 05c211645330..d8c6a821a3b8 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -41,7 +41,7 @@ Core library: - Schedulers: @williamberman and @patrickvonplaten - Pipelines: @patrickvonplaten and @sayakpaul - Training examples: @sayakpaul and @patrickvonplaten -- Docs: @stevenliu and @yiyixu +- Docs: @stevhliu and @yiyixuxu - JAX and MPS: @pcuenca - Audio: @sanchit-gandhi - General functionalities: @patrickvonplaten and @sayakpaul From 1037287e2bae30e33f844786f5e65ca5a69c41a1 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 12 Sep 2023 
23:52:41 +0200 Subject: [PATCH 28/37] examples fix t2i training (#5001) * examples fix t2i training * make style --- examples/t2i_adapter/train_t2i_adapter_sdxl.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/t2i_adapter/train_t2i_adapter_sdxl.py b/examples/t2i_adapter/train_t2i_adapter_sdxl.py index 3d846f42f649..d39081bbfc23 100644 --- a/examples/t2i_adapter/train_t2i_adapter_sdxl.py +++ b/examples/t2i_adapter/train_t2i_adapter_sdxl.py @@ -1060,7 +1060,9 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32): ) # Prepare everything with our `accelerator`. - t2iadapter, optimizer, lr_scheduler = accelerator.prepare(t2iadapter, optimizer, lr_scheduler) + t2iadapter, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + t2iadapter, optimizer, train_dataloader, lr_scheduler + ) # We need to recalculate our total training steps as the size of the training dataloader may have changed. num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) From 8009272f48764a5ef3f90d7d400337f0b2e84f1d Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Wed, 13 Sep 2023 10:01:37 +0100 Subject: [PATCH 29/37] [Tests and Docs] Add a test on serializing pipelines with components containing fused LoRA modules (#4962) * add: test to ensure pipelines can be saved with fused lora modules. * add docs about serialization with fused lora. * Apply suggestions from code review Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> * Empty-Commit * Update docs/source/en/training/lora.md Co-authored-by: Patrick von Platen --------- Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> Co-authored-by: Patrick von Platen --- docs/source/en/training/lora.md | 47 ++++++++++++++++++++++++++++-- tests/models/test_lora_layers.py | 49 ++++++++++++++++++++++++++------ 2 files changed, 85 insertions(+), 11 deletions(-) diff --git a/docs/source/en/training/lora.md b/docs/source/en/training/lora.md index 80b4c58b8a88..dd7013c05932 100644 --- a/docs/source/en/training/lora.md +++ b/docs/source/en/training/lora.md @@ -34,7 +34,7 @@ the attention layers of a language model is sufficient to obtain good downstream [cloneofsimo](https://github.com/cloneofsimo) was the first to try out LoRA training for Stable Diffusion in the popular [lora](https://github.com/cloneofsimo/lora) GitHub repository. 🧨 Diffusers now supports finetuning with LoRA for [text-to-image generation](https://github.com/huggingface/diffusers/tree/main/examples/text_to_image#training-with-lora) and [DreamBooth](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth#training-with-low-rank-adaptation-of-large-language-models-lora). This guide will show you how to do both. -If you'd like to store or share your model with the community, login to your Hugging Face account (create [one](hf.co/join) if you don't have one already): +If you'd like to store or share your model with the community, login to your Hugging Face account (create [one](https://hf.co/join) if you don't have one already): ```bash huggingface-cli login @@ -321,7 +321,7 @@ pipe.fuse_lora() generator = torch.manual_seed(0) images_fusion = pipe( - "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2 + "masterpiece, best quality, mountain", generator=generator, num_inference_steps=2 ).images # To work with a different `lora_scale`, first reverse the effects of `fuse_lora()`. 
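For intuition about what `fuse_lora(lora_scale=...)` in the docs above folds into the weights, here is an illustrative sketch of the LoRA fusion arithmetic with toy tensors (this is not the library's internal code):

```python
import torch

rank, network_alpha, lora_scale = 4, 4.0, 0.5
w = torch.randn(16, 16)              # base linear weight
down = torch.randn(rank, 16) * 0.01  # LoRA down projection
up = torch.randn(16, rank) * 0.01    # LoRA up projection

# Fusing adds the scaled low-rank delta directly into the base weight.
delta = (network_alpha / rank) * (up @ down)
w_fused = w + lora_scale * delta

# `unfuse_lora()` restores the original weight by subtracting the same delta,
# which is why changing the scale requires unfusing and fusing again.
w_restored = w_fused - lora_scale * delta
assert torch.allclose(w, w_restored, atol=1e-6)
```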
@@ -333,7 +333,48 @@ pipe.fuse_lora(lora_scale=0.5) generator = torch.manual_seed(0) images_fusion = pipe( - "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2 + "masterpiece, best quality, mountain", generator=generator, num_inference_steps=2 +).images +``` + +## Serializing pipelines with fused LoRA parameters + +Let's say you want to serialize the pipeline above with its UNet already fused with the LoRA parameters. You can do so by calling the `save_pretrained()` method on `pipe`. + +After loading the LoRA parameters into a pipeline, if you want to serialize the pipeline such that the affected model components are already fused with the LoRA parameters, you should: + +* call `fuse_lora()` on the pipeline with the desired `lora_scale`, given you've already loaded the LoRA parameters into it. +* call `save_pretrained()` on the pipeline. + +Here is a complete example: + +```python +from diffusers import DiffusionPipeline +import torch + +pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda") +lora_model_id = "hf-internal-testing/sdxl-1.0-lora" +lora_filename = "sd_xl_offset_example-lora_1.0.safetensors" +pipe.load_lora_weights(lora_model_id, weight_name=lora_filename) + +# First, fuse the LoRA parameters. +pipe.fuse_lora() + +# Then save. +pipe.save_pretrained("my-pipeline-with-fused-lora") +``` + +Now, you can load the pipeline and directly perform inference without having to load the LoRA parameters again: + +```python +from diffusers import DiffusionPipeline +import torch + +pipe = DiffusionPipeline.from_pretrained("my-pipeline-with-fused-lora", torch_dtype=torch.float16).to("cuda") + +generator = torch.manual_seed(0) +images_fusion = pipe( + "masterpiece, best quality, mountain", generator=generator, num_inference_steps=2 ).images ``` diff --git a/tests/models/test_lora_layers.py b/tests/models/test_lora_layers.py index 1d846b6cdb3f..9affb37aa5d6 100644 --- a/tests/models/test_lora_layers.py +++ b/tests/models/test_lora_layers.py @@ -965,15 +965,11 @@ def test_with_different_scales_fusion_equivalence(self): pipeline_components, lora_components = self.get_dummy_components() sd_pipe = StableDiffusionXLPipeline(**pipeline_components) sd_pipe = sd_pipe.to(torch_device) - # sd_pipe.unet.set_default_attn_processor() sd_pipe.set_progress_bar_config(disable=None) _, _, pipeline_inputs = self.get_dummy_inputs(with_generator=False) - images = sd_pipe( - **pipeline_inputs, - generator=torch.manual_seed(0), - ).images + images = sd_pipe(**pipeline_inputs, generator=torch.manual_seed(0)).images images_slice = images[0, -3:, -3:, -1] # Emulate training.
@@ -993,9 +989,7 @@ def test_with_different_scales_fusion_equivalence(self): sd_pipe.load_lora_weights(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors")) lora_images_scale_0_5 = sd_pipe( - **pipeline_inputs, - generator=torch.manual_seed(0), - cross_attention_kwargs={"scale": 0.5}, + **pipeline_inputs, generator=torch.manual_seed(0), cross_attention_kwargs={"scale": 0.5} ).images lora_image_slice_scale_0_5 = lora_images_scale_0_5[0, -3:, -3:, -1] @@ -1017,6 +1011,45 @@ def test_with_different_scales_fusion_equivalence(self): images_slice, lora_image_slice_scale_0_5, atol=1e-03 ), "0.5 scale and no scale shouldn't match" + def test_save_load_fused_lora_modules(self): + pipeline_components, lora_components = self.get_dummy_components() + sd_pipe = StableDiffusionXLPipeline(**pipeline_components) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + _, _, pipeline_inputs = self.get_dummy_inputs(with_generator=False) + + # Emulate training. + set_lora_weights(lora_components["unet_lora_layers"].parameters(), randn_weight=True, var=0.1) + set_lora_weights(lora_components["text_encoder_one_lora_layers"].parameters(), randn_weight=True, var=0.1) + set_lora_weights(lora_components["text_encoder_two_lora_layers"].parameters(), randn_weight=True, var=0.1) + + with tempfile.TemporaryDirectory() as tmpdirname: + StableDiffusionXLPipeline.save_lora_weights( + save_directory=tmpdirname, + unet_lora_layers=lora_components["unet_lora_layers"], + text_encoder_lora_layers=lora_components["text_encoder_one_lora_layers"], + text_encoder_2_lora_layers=lora_components["text_encoder_two_lora_layers"], + safe_serialization=True, + ) + self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors"))) + sd_pipe.load_lora_weights(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors")) + + sd_pipe.fuse_lora() + lora_images_fusion = sd_pipe(**pipeline_inputs, generator=torch.manual_seed(0)).images + lora_image_slice_fusion = lora_images_fusion[0, -3:, -3:, -1] + + with tempfile.TemporaryDirectory() as tmpdirname: + sd_pipe.save_pretrained(tmpdirname) + sd_pipe_loaded = StableDiffusionXLPipeline.from_pretrained(tmpdirname) + + loaded_lora_images = sd_pipe_loaded(**pipeline_inputs, generator=torch.manual_seed(0)).images + loaded_lora_image_slice = loaded_lora_images[0, -3:, -3:, -1] + + assert np.allclose( + lora_image_slice_fusion, loaded_lora_image_slice, atol=1e-03 + ), "The pipeline was serialized with LoRA parameters fused inside of the respective modules. The loaded pipeline should therefore yield matching outputs."
+ @slow @require_torch_gpu From 324aef6d148bdd260ac8e1a1c29571aed4bdc62f Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 13 Sep 2023 11:05:20 +0200 Subject: [PATCH 30/37] [SDXL] Add LoRA to all pipelines (#4896) * [SDXL] Add LoRA to all pipelines * fix all * fix all * fix all * fix more docs * make style --- docs/source/en/api/loaders.md | 4 + src/diffusers/loaders.py | 149 ++++++++++++++++++ .../pipeline_controlnet_inpaint_sd_xl.py | 120 +------------- .../controlnet/pipeline_controlnet_sd_xl.py | 116 +------------- .../pipeline_controlnet_sd_xl_img2img.py | 10 +- .../pipeline_stable_diffusion_xl.py | 117 +------------- .../pipeline_stable_diffusion_xl_img2img.py | 118 +------------- .../pipeline_stable_diffusion_xl_inpaint.py | 118 +------------- ...ne_stable_diffusion_xl_instruct_pix2pix.py | 18 ++- .../pipeline_stable_diffusion_xl_adapter.py | 6 +- 10 files changed, 203 insertions(+), 573 deletions(-) diff --git a/docs/source/en/api/loaders.md b/docs/source/en/api/loaders.md index 98aaea006088..5c7c3ef660ca 100644 --- a/docs/source/en/api/loaders.md +++ b/docs/source/en/api/loaders.md @@ -28,6 +28,10 @@ Adapters (textual inversion, LoRA, hypernetworks) allow you to modify a diffusio [[autodoc]] loaders.TextualInversionLoaderMixin +## StableDiffusionXLLoraLoaderMixin + +[[autodoc]] loaders.StableDiffusionXLLoraLoaderMixin + ## LoraLoaderMixin [[autodoc]] loaders.LoraLoaderMixin diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index 16eabb0077bb..51814a611a00 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -33,6 +33,7 @@ _get_model_file, deprecate, is_accelerate_available, + is_accelerate_version, is_omegaconf_available, is_transformers_available, logging, @@ -2556,3 +2557,151 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs): controlnet.to(torch_dtype=torch_dtype) return controlnet + + +class StableDiffusionXLLoraLoaderMixin(LoraLoaderMixin): + """This class overrides `LoraLoaderMixin` with LoRA loading/saving code that's specific to SDXL""" + + # Override to properly handle the loading and unloading of the additional text encoder. + def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], **kwargs): + """ + Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.unet` and + `self.text_encoder`. + + All kwargs are forwarded to `self.lora_state_dict`. + + See [`~loaders.LoraLoaderMixin.lora_state_dict`] for more details on how the state dict is loaded. + + See [`~loaders.LoraLoaderMixin.load_lora_into_unet`] for more details on how the state dict is loaded into + `self.unet`. + + See [`~loaders.LoraLoaderMixin.load_lora_into_text_encoder`] for more details on how the state dict is loaded + into `self.text_encoder`. + + Parameters: + pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): + See [`~loaders.LoraLoaderMixin.lora_state_dict`]. + kwargs (`dict`, *optional*): + See [`~loaders.LoraLoaderMixin.lora_state_dict`]. + """ + # We could have accessed the unet config from `lora_state_dict()` too. We pass + # it here explicitly to be able to tell that it's coming from an SDXL + # pipeline. + + # Remove any existing hooks.
+ if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate.hooks import AlignDevicesHook, CpuOffload, remove_hook_from_module + else: + raise ImportError("Offloading requires `accelerate v0.17.0` or higher.") + + is_model_cpu_offload = False + is_sequential_cpu_offload = False + recursive = False + for _, component in self.components.items(): + if isinstance(component, torch.nn.Module): + if hasattr(component, "_hf_hook"): + is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload) + is_sequential_cpu_offload = isinstance(getattr(component, "_hf_hook"), AlignDevicesHook) + logger.info( + "Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again." + ) + recursive = is_sequential_cpu_offload + remove_hook_from_module(component, recurse=recursive) + state_dict, network_alphas = self.lora_state_dict( + pretrained_model_name_or_path_or_dict, + unet_config=self.unet.config, + **kwargs, + ) + self.load_lora_into_unet(state_dict, network_alphas=network_alphas, unet=self.unet) + + text_encoder_state_dict = {k: v for k, v in state_dict.items() if "text_encoder." in k} + if len(text_encoder_state_dict) > 0: + self.load_lora_into_text_encoder( + text_encoder_state_dict, + network_alphas=network_alphas, + text_encoder=self.text_encoder, + prefix="text_encoder", + lora_scale=self.lora_scale, + ) + + text_encoder_2_state_dict = {k: v for k, v in state_dict.items() if "text_encoder_2." in k} + if len(text_encoder_2_state_dict) > 0: + self.load_lora_into_text_encoder( + text_encoder_2_state_dict, + network_alphas=network_alphas, + text_encoder=self.text_encoder_2, + prefix="text_encoder_2", + lora_scale=self.lora_scale, + ) + + # Offload back. + if is_model_cpu_offload: + self.enable_model_cpu_offload() + elif is_sequential_cpu_offload: + self.enable_sequential_cpu_offload() + + @classmethod + def save_lora_weights( + self, + save_directory: Union[str, os.PathLike], + unet_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None, + text_encoder_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None, + text_encoder_2_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None, + is_main_process: bool = True, + weight_name: str = None, + save_function: Callable = None, + safe_serialization: bool = True, + ): + r""" + Save the LoRA parameters corresponding to the UNet and text encoder. + + Arguments: + save_directory (`str` or `os.PathLike`): + Directory to save LoRA parameters to. Will be created if it doesn't exist. + unet_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): + State dict of the LoRA layers corresponding to the `unet`. + text_encoder_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): + State dict of the LoRA layers corresponding to the `text_encoder`. Must explicitly pass the text + encoder LoRA state dict because it comes from 🤗 Transformers. + is_main_process (`bool`, *optional*, defaults to `True`): + Whether the process calling this is the main process or not. Useful during distributed training when you + need to call this function on all processes. In this case, set `is_main_process=True` only on the main + process to avoid race conditions. + save_function (`Callable`): + The function to use to save the state dictionary. Useful during distributed training when you need to + replace `torch.save` with another method.
Can be configured with the environment variable + `DIFFUSERS_SAVE_MODE`. + safe_serialization (`bool`, *optional*, defaults to `True`): + Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`. + """ + state_dict = {} + + def pack_weights(layers, prefix): + layers_weights = layers.state_dict() if isinstance(layers, torch.nn.Module) else layers + layers_state_dict = {f"{prefix}.{module_name}": param for module_name, param in layers_weights.items()} + return layers_state_dict + + if not (unet_lora_layers or text_encoder_lora_layers or text_encoder_2_lora_layers): + raise ValueError( + "You must pass at least one of `unet_lora_layers`, `text_encoder_lora_layers` or `text_encoder_2_lora_layers`." + ) + + if unet_lora_layers: + state_dict.update(pack_weights(unet_lora_layers, "unet")) + + if text_encoder_lora_layers and text_encoder_2_lora_layers: + state_dict.update(pack_weights(text_encoder_lora_layers, "text_encoder")) + state_dict.update(pack_weights(text_encoder_2_lora_layers, "text_encoder_2")) + + self.write_lora_layers( + state_dict=state_dict, + save_directory=save_directory, + is_main_process=is_main_process, + weight_name=weight_name, + save_function=save_function, + safe_serialization=safe_serialization, + ) + + def _remove_text_encoder_monkey_patch(self): + self._remove_text_encoder_monkey_patch_classmethod(self.text_encoder) + self._remove_text_encoder_monkey_patch_classmethod(self.text_encoder_2) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py index 6595d8f4566d..cb4ec2f25bd3 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py @@ -13,7 +13,6 @@ # limitations under the License. import inspect -import os from typing import Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np @@ -25,7 +24,7 @@ from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ...models.attention_processor import ( AttnProcessor2_0, @@ -36,8 +35,6 @@ from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( - is_accelerate_available, - is_accelerate_version, is_invisible_watermark_available, logging, replace_example_docstring, @@ -128,7 +125,9 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): return noise_cfg -class StableDiffusionXLControlNetInpaintPipeline(DiffusionPipeline, LoraLoaderMixin, FromSingleFileMixin): +class StableDiffusionXLControlNetInpaintPipeline( + DiffusionPipeline, StableDiffusionXLLoraLoaderMixin, FromSingleFileMixin +): r""" Pipeline for text-to-image generation using Stable Diffusion XL. @@ -136,11 +135,11 @@ class StableDiffusionXLControlNetInpaintPipeline(DiffusionPipeline, LoraLoaderMi library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) 
In addition the pipeline inherits the following loading methods: - - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] + - *LoRA*: [`loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`] as well as the following saving methods: - - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] + - *LoRA*: [`loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] Args: vae ([`AutoencoderKL`]): @@ -308,7 +307,7 @@ def encode_prompt( # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -1510,108 +1509,3 @@ def denoising_value_valid(dnv): return (image,) return StableDiffusionXLPipelineOutput(images=image) - - # Overrride to properly handle the loading and unloading of the additional text encoder. - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.load_lora_weights - def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], **kwargs): - # We could have accessed the unet config from `lora_state_dict()` too. We pass - # it here explicitly to be able to tell that it's coming from an SDXL - # pipeline. - - # Remove any existing hooks. - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate.hooks import AlignDevicesHook, CpuOffload, remove_hook_from_module - else: - raise ImportError("Offloading requires `accelerate v0.17.0` or higher.") - - is_model_cpu_offload = False - is_sequential_cpu_offload = False - recursive = False - for _, component in self.components.items(): - if isinstance(component, torch.nn.Module): - if hasattr(component, "_hf_hook"): - is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload) - is_sequential_cpu_offload = isinstance(getattr(component, "_hf_hook"), AlignDevicesHook) - logger.info( - "Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again." - ) - recursive = is_sequential_cpu_offload - remove_hook_from_module(component, recurse=recursive) - state_dict, network_alphas = self.lora_state_dict( - pretrained_model_name_or_path_or_dict, - unet_config=self.unet.config, - **kwargs, - ) - self.load_lora_into_unet(state_dict, network_alphas=network_alphas, unet=self.unet) - - text_encoder_state_dict = {k: v for k, v in state_dict.items() if "text_encoder." in k} - if len(text_encoder_state_dict) > 0: - self.load_lora_into_text_encoder( - text_encoder_state_dict, - network_alphas=network_alphas, - text_encoder=self.text_encoder, - prefix="text_encoder", - lora_scale=self.lora_scale, - ) - - text_encoder_2_state_dict = {k: v for k, v in state_dict.items() if "text_encoder_2." in k} - if len(text_encoder_2_state_dict) > 0: - self.load_lora_into_text_encoder( - text_encoder_2_state_dict, - network_alphas=network_alphas, - text_encoder=self.text_encoder_2, - prefix="text_encoder_2", - lora_scale=self.lora_scale, - ) - - # Offload back. 
- if is_model_cpu_offload: - self.enable_model_cpu_offload() - elif is_sequential_cpu_offload: - self.enable_sequential_cpu_offload() - - @classmethod - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.save_lora_weights - def save_lora_weights( - self, - save_directory: Union[str, os.PathLike], - unet_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None, - text_encoder_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None, - text_encoder_2_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None, - is_main_process: bool = True, - weight_name: str = None, - save_function: Callable = None, - safe_serialization: bool = True, - ): - state_dict = {} - - def pack_weights(layers, prefix): - layers_weights = layers.state_dict() if isinstance(layers, torch.nn.Module) else layers - layers_state_dict = {f"{prefix}.{module_name}": param for module_name, param in layers_weights.items()} - return layers_state_dict - - if not (unet_lora_layers or text_encoder_lora_layers or text_encoder_2_lora_layers): - raise ValueError( - "You must pass at least one of `unet_lora_layers`, `text_encoder_lora_layers` or `text_encoder_2_lora_layers`." - ) - - if unet_lora_layers: - state_dict.update(pack_weights(unet_lora_layers, "unet")) - - if text_encoder_lora_layers and text_encoder_2_lora_layers: - state_dict.update(pack_weights(text_encoder_lora_layers, "text_encoder")) - state_dict.update(pack_weights(text_encoder_2_lora_layers, "text_encoder_2")) - - self.write_lora_layers( - state_dict=state_dict, - save_directory=save_directory, - is_main_process=is_main_process, - weight_name=weight_name, - save_function=save_function, - safe_serialization=safe_serialization, - ) - - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._remove_text_encoder_monkey_patch - def _remove_text_encoder_monkey_patch(self): - self._remove_text_encoder_monkey_patch_classmethod(self.text_encoder) - self._remove_text_encoder_monkey_patch_classmethod(self.text_encoder_2) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py index e2f463329c3d..cbb78e509b84 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py @@ -14,7 +14,6 @@ import inspect -import os from typing import Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np @@ -26,7 +25,7 @@ from diffusers.utils.import_utils import is_invisible_watermark_available from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ...models.attention_processor import ( AttnProcessor2_0, @@ -37,8 +36,6 @@ from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( - is_accelerate_available, - is_accelerate_version, logging, replace_example_docstring, ) @@ -103,7 +100,7 @@ class StableDiffusionXLControlNetPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin + DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionXLLoraLoaderMixin, FromSingleFileMixin 
): r""" Pipeline for text-to-image generation using Stable Diffusion XL with ControlNet guidance. @@ -113,7 +110,7 @@ class StableDiffusionXLControlNetPipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights - [`loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files Args: @@ -283,7 +280,7 @@ def encode_prompt( # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -1176,108 +1173,3 @@ def __call__( return (image,) return StableDiffusionXLPipelineOutput(images=image) - - # Overrride to properly handle the loading and unloading of the additional text encoder. - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.load_lora_weights - def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], **kwargs): - # We could have accessed the unet config from `lora_state_dict()` too. We pass - # it here explicitly to be able to tell that it's coming from an SDXL - # pipeline. - - # Remove any existing hooks. - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate.hooks import AlignDevicesHook, CpuOffload, remove_hook_from_module - else: - raise ImportError("Offloading requires `accelerate v0.17.0` or higher.") - - is_model_cpu_offload = False - is_sequential_cpu_offload = False - recursive = False - for _, component in self.components.items(): - if isinstance(component, torch.nn.Module): - if hasattr(component, "_hf_hook"): - is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload) - is_sequential_cpu_offload = isinstance(getattr(component, "_hf_hook"), AlignDevicesHook) - logger.info( - "Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again." - ) - recursive = is_sequential_cpu_offload - remove_hook_from_module(component, recurse=recursive) - state_dict, network_alphas = self.lora_state_dict( - pretrained_model_name_or_path_or_dict, - unet_config=self.unet.config, - **kwargs, - ) - self.load_lora_into_unet(state_dict, network_alphas=network_alphas, unet=self.unet) - - text_encoder_state_dict = {k: v for k, v in state_dict.items() if "text_encoder." in k} - if len(text_encoder_state_dict) > 0: - self.load_lora_into_text_encoder( - text_encoder_state_dict, - network_alphas=network_alphas, - text_encoder=self.text_encoder, - prefix="text_encoder", - lora_scale=self.lora_scale, - ) - - text_encoder_2_state_dict = {k: v for k, v in state_dict.items() if "text_encoder_2." in k} - if len(text_encoder_2_state_dict) > 0: - self.load_lora_into_text_encoder( - text_encoder_2_state_dict, - network_alphas=network_alphas, - text_encoder=self.text_encoder_2, - prefix="text_encoder_2", - lora_scale=self.lora_scale, - ) - - # Offload back. 
- if is_model_cpu_offload: - self.enable_model_cpu_offload() - elif is_sequential_cpu_offload: - self.enable_sequential_cpu_offload() - - @classmethod - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.save_lora_weights - def save_lora_weights( - self, - save_directory: Union[str, os.PathLike], - unet_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None, - text_encoder_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None, - text_encoder_2_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None, - is_main_process: bool = True, - weight_name: str = None, - save_function: Callable = None, - safe_serialization: bool = True, - ): - state_dict = {} - - def pack_weights(layers, prefix): - layers_weights = layers.state_dict() if isinstance(layers, torch.nn.Module) else layers - layers_state_dict = {f"{prefix}.{module_name}": param for module_name, param in layers_weights.items()} - return layers_state_dict - - if not (unet_lora_layers or text_encoder_lora_layers or text_encoder_2_lora_layers): - raise ValueError( - "You must pass at least one of `unet_lora_layers`, `text_encoder_lora_layers` or `text_encoder_2_lora_layers`." - ) - - if unet_lora_layers: - state_dict.update(pack_weights(unet_lora_layers, "unet")) - - if text_encoder_lora_layers and text_encoder_2_lora_layers: - state_dict.update(pack_weights(text_encoder_lora_layers, "text_encoder")) - state_dict.update(pack_weights(text_encoder_2_lora_layers, "text_encoder_2")) - - self.write_lora_layers( - state_dict=state_dict, - save_directory=save_directory, - is_main_process=is_main_process, - weight_name=weight_name, - save_function=save_function, - safe_serialization=safe_serialization, - ) - - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._remove_text_encoder_monkey_patch - def _remove_text_encoder_monkey_patch(self): - self._remove_text_encoder_monkey_patch_classmethod(self.text_encoder) - self._remove_text_encoder_monkey_patch_classmethod(self.text_encoder_2) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py index 8337b704450b..6fe3d0c641e5 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py @@ -25,7 +25,7 @@ from diffusers.utils.import_utils import is_invisible_watermark_available from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ...models.attention_processor import ( AttnProcessor2_0, @@ -128,7 +128,9 @@ """ -class StableDiffusionXLControlNetImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): +class StableDiffusionXLControlNetImg2ImgPipeline( + DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionXLLoraLoaderMixin +): r""" Pipeline for image-to-image generation using Stable Diffusion XL with ControlNet guidance. 
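From the caller's perspective, the new mixin keeps the familiar API; a brief sketch, reusing the LoRA repo id and file name from the docs example earlier in this series:

```python
import torch
from diffusers import StableDiffusionXLPipeline

pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
)
# The mixin splits the state dict by prefix and loads the `unet`,
# `text_encoder`, and `text_encoder_2` parts separately, covering SDXL's
# second text encoder, which the base `LoraLoaderMixin` does not handle.
pipe.load_lora_weights(
    "hf-internal-testing/sdxl-1.0-lora",
    weight_name="sd_xl_offset_example-lora_1.0.safetensors",
)
```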
@@ -137,7 +139,7 @@ class StableDiffusionXLControlNetImg2ImgPipeline(DiffusionPipeline, TextualInver In addition the pipeline inherits the following loading methods: - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] - - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] + - *LoRA*: [`loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] Args: vae ([`AutoencoderKL`]): @@ -316,7 +318,7 @@ def encode_prompt( # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index 84fc9c7c5788..40119a6087d2 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -13,7 +13,6 @@ # limitations under the License. import inspect -import os from typing import Any, Callable, Dict, List, Optional, Tuple, Union import torch @@ -22,7 +21,7 @@ from ...image_processor import VaeImageProcessor from ...loaders import ( FromSingleFileMixin, - LoraLoaderMixin, + StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin, ) from ...models import AutoencoderKL, UNet2DConditionModel @@ -35,8 +34,6 @@ from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( - is_accelerate_available, - is_accelerate_version, is_invisible_watermark_available, logging, replace_example_docstring, @@ -84,7 +81,9 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): return noise_cfg -class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin): +class StableDiffusionXLPipeline( + DiffusionPipeline, FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin +): r""" Pipeline for text-to-image generation using Stable Diffusion XL. @@ -92,11 +91,11 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin, LoraLoad library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) In addition the pipeline inherits the following loading methods: - - *LoRA*: [`StableDiffusionXLPipeline.load_lora_weights`] + - *LoRA*: [`loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`] as well as the following saving methods: - - *LoRA*: [`loaders.StableDiffusionXLPipeline.save_lora_weights`] + - *LoRA*: [`loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] Args: vae ([`AutoencoderKL`]): @@ -257,7 +256,7 @@ def encode_prompt( # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -886,105 +885,3 @@ def __call__( return (image,) return StableDiffusionXLPipelineOutput(images=image) - - # Overrride to properly handle the loading and unloading of the additional text encoder. 
- def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], **kwargs): - # We could have accessed the unet config from `lora_state_dict()` too. We pass - # it here explicitly to be able to tell that it's coming from an SDXL - # pipeline. - - # Remove any existing hooks. - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate.hooks import AlignDevicesHook, CpuOffload, remove_hook_from_module - else: - raise ImportError("Offloading requires `accelerate v0.17.0` or higher.") - - is_model_cpu_offload = False - is_sequential_cpu_offload = False - recursive = False - for _, component in self.components.items(): - if isinstance(component, torch.nn.Module): - if hasattr(component, "_hf_hook"): - is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload) - is_sequential_cpu_offload = isinstance(getattr(component, "_hf_hook"), AlignDevicesHook) - logger.info( - "Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again." - ) - recursive = is_sequential_cpu_offload - remove_hook_from_module(component, recurse=recursive) - state_dict, network_alphas = self.lora_state_dict( - pretrained_model_name_or_path_or_dict, - unet_config=self.unet.config, - **kwargs, - ) - self.load_lora_into_unet(state_dict, network_alphas=network_alphas, unet=self.unet) - - text_encoder_state_dict = {k: v for k, v in state_dict.items() if "text_encoder." in k} - if len(text_encoder_state_dict) > 0: - self.load_lora_into_text_encoder( - text_encoder_state_dict, - network_alphas=network_alphas, - text_encoder=self.text_encoder, - prefix="text_encoder", - lora_scale=self.lora_scale, - ) - - text_encoder_2_state_dict = {k: v for k, v in state_dict.items() if "text_encoder_2." in k} - if len(text_encoder_2_state_dict) > 0: - self.load_lora_into_text_encoder( - text_encoder_2_state_dict, - network_alphas=network_alphas, - text_encoder=self.text_encoder_2, - prefix="text_encoder_2", - lora_scale=self.lora_scale, - ) - - # Offload back. - if is_model_cpu_offload: - self.enable_model_cpu_offload() - elif is_sequential_cpu_offload: - self.enable_sequential_cpu_offload() - - @classmethod - def save_lora_weights( - self, - save_directory: Union[str, os.PathLike], - unet_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None, - text_encoder_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None, - text_encoder_2_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None, - is_main_process: bool = True, - weight_name: str = None, - save_function: Callable = None, - safe_serialization: bool = True, - ): - state_dict = {} - - def pack_weights(layers, prefix): - layers_weights = layers.state_dict() if isinstance(layers, torch.nn.Module) else layers - layers_state_dict = {f"{prefix}.{module_name}": param for module_name, param in layers_weights.items()} - return layers_state_dict - - if not (unet_lora_layers or text_encoder_lora_layers or text_encoder_2_lora_layers): - raise ValueError( - "You must pass at least one of `unet_lora_layers`, `text_encoder_lora_layers` or `text_encoder_2_lora_layers`." 
- ) - - if unet_lora_layers: - state_dict.update(pack_weights(unet_lora_layers, "unet")) - - if text_encoder_lora_layers and text_encoder_2_lora_layers: - state_dict.update(pack_weights(text_encoder_lora_layers, "text_encoder")) - state_dict.update(pack_weights(text_encoder_2_lora_layers, "text_encoder_2")) - - self.write_lora_layers( - state_dict=state_dict, - save_directory=save_directory, - is_main_process=is_main_process, - weight_name=weight_name, - save_function=save_function, - safe_serialization=safe_serialization, - ) - - def _remove_text_encoder_monkey_patch(self): - self._remove_text_encoder_monkey_patch_classmethod(self.text_encoder) - self._remove_text_encoder_monkey_patch_classmethod(self.text_encoder_2) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index 4b66193f75a9..162fc828fff4 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -13,7 +13,6 @@ # limitations under the License. import inspect -import os from typing import Any, Callable, Dict, List, Optional, Tuple, Union import PIL.Image @@ -21,7 +20,7 @@ from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import ( AttnProcessor2_0, @@ -32,8 +31,6 @@ from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( - is_accelerate_available, - is_accelerate_version, is_invisible_watermark_available, logging, replace_example_docstring, @@ -85,7 +82,7 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): class StableDiffusionXLImg2ImgPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin + DiffusionPipeline, TextualInversionLoaderMixin, FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin ): r""" Pipeline for text-to-image generation using Stable Diffusion XL. @@ -94,11 +91,11 @@ class StableDiffusionXLImg2ImgPipeline( library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) 
In addition the pipeline inherits the following loading methods: - - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] + - *LoRA*: [`loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`] as well as the following saving methods: - - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] + - *LoRA*: [`loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] Args: vae ([`AutoencoderKL`]): @@ -266,7 +263,7 @@ def encode_prompt( # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -1036,108 +1033,3 @@ def denoising_value_valid(dnv): return (image,) return StableDiffusionXLPipelineOutput(images=image) - - # Overrride to properly handle the loading and unloading of the additional text encoder. - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.load_lora_weights - def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], **kwargs): - # We could have accessed the unet config from `lora_state_dict()` too. We pass - # it here explicitly to be able to tell that it's coming from an SDXL - # pipeline. - - # Remove any existing hooks. - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate.hooks import AlignDevicesHook, CpuOffload, remove_hook_from_module - else: - raise ImportError("Offloading requires `accelerate v0.17.0` or higher.") - - is_model_cpu_offload = False - is_sequential_cpu_offload = False - recursive = False - for _, component in self.components.items(): - if isinstance(component, torch.nn.Module): - if hasattr(component, "_hf_hook"): - is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload) - is_sequential_cpu_offload = isinstance(getattr(component, "_hf_hook"), AlignDevicesHook) - logger.info( - "Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again." - ) - recursive = is_sequential_cpu_offload - remove_hook_from_module(component, recurse=recursive) - state_dict, network_alphas = self.lora_state_dict( - pretrained_model_name_or_path_or_dict, - unet_config=self.unet.config, - **kwargs, - ) - self.load_lora_into_unet(state_dict, network_alphas=network_alphas, unet=self.unet) - - text_encoder_state_dict = {k: v for k, v in state_dict.items() if "text_encoder." in k} - if len(text_encoder_state_dict) > 0: - self.load_lora_into_text_encoder( - text_encoder_state_dict, - network_alphas=network_alphas, - text_encoder=self.text_encoder, - prefix="text_encoder", - lora_scale=self.lora_scale, - ) - - text_encoder_2_state_dict = {k: v for k, v in state_dict.items() if "text_encoder_2." in k} - if len(text_encoder_2_state_dict) > 0: - self.load_lora_into_text_encoder( - text_encoder_2_state_dict, - network_alphas=network_alphas, - text_encoder=self.text_encoder_2, - prefix="text_encoder_2", - lora_scale=self.lora_scale, - ) - - # Offload back. 
- if is_model_cpu_offload: - self.enable_model_cpu_offload() - elif is_sequential_cpu_offload: - self.enable_sequential_cpu_offload() - - @classmethod - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.save_lora_weights - def save_lora_weights( - self, - save_directory: Union[str, os.PathLike], - unet_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None, - text_encoder_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None, - text_encoder_2_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None, - is_main_process: bool = True, - weight_name: str = None, - save_function: Callable = None, - safe_serialization: bool = True, - ): - state_dict = {} - - def pack_weights(layers, prefix): - layers_weights = layers.state_dict() if isinstance(layers, torch.nn.Module) else layers - layers_state_dict = {f"{prefix}.{module_name}": param for module_name, param in layers_weights.items()} - return layers_state_dict - - if not (unet_lora_layers or text_encoder_lora_layers or text_encoder_2_lora_layers): - raise ValueError( - "You must pass at least one of `unet_lora_layers`, `text_encoder_lora_layers` or `text_encoder_2_lora_layers`." - ) - - if unet_lora_layers: - state_dict.update(pack_weights(unet_lora_layers, "unet")) - - if text_encoder_lora_layers and text_encoder_2_lora_layers: - state_dict.update(pack_weights(text_encoder_lora_layers, "text_encoder")) - state_dict.update(pack_weights(text_encoder_2_lora_layers, "text_encoder_2")) - - self.write_lora_layers( - state_dict=state_dict, - save_directory=save_directory, - is_main_process=is_main_process, - weight_name=weight_name, - save_function=save_function, - safe_serialization=safe_serialization, - ) - - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._remove_text_encoder_monkey_patch - def _remove_text_encoder_monkey_patch(self): - self._remove_text_encoder_monkey_patch_classmethod(self.text_encoder) - self._remove_text_encoder_monkey_patch_classmethod(self.text_encoder_2) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py index 55baada04294..25753859c210 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py @@ -13,7 +13,6 @@ # limitations under the License. 
import inspect -import os from typing import Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np @@ -22,7 +21,7 @@ from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import ( AttnProcessor2_0, @@ -34,8 +33,6 @@ from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( deprecate, - is_accelerate_available, - is_accelerate_version, is_invisible_watermark_available, logging, replace_example_docstring, @@ -231,7 +228,7 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image: bool class StableDiffusionXLInpaintPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin + DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionXLLoraLoaderMixin, FromSingleFileMixin ): r""" Pipeline for text-to-image generation using Stable Diffusion XL. @@ -240,11 +237,11 @@ class StableDiffusionXLInpaintPipeline( library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) In addition the pipeline inherits the following loading methods: - - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] + - *LoRA*: [`loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`] as well as the following saving methods: - - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] + - *LoRA*: [`loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] Args: vae ([`AutoencoderKL`]): @@ -415,7 +412,7 @@ def encode_prompt( # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -1355,108 +1352,3 @@ def denoising_value_valid(dnv): return (image,) return StableDiffusionXLPipelineOutput(images=image) - - # Overrride to properly handle the loading and unloading of the additional text encoder. - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.load_lora_weights - def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], **kwargs): - # We could have accessed the unet config from `lora_state_dict()` too. We pass - # it here explicitly to be able to tell that it's coming from an SDXL - # pipeline. - - # Remove any existing hooks. 
- if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate.hooks import AlignDevicesHook, CpuOffload, remove_hook_from_module - else: - raise ImportError("Offloading requires `accelerate v0.17.0` or higher.") - - is_model_cpu_offload = False - is_sequential_cpu_offload = False - recursive = False - for _, component in self.components.items(): - if isinstance(component, torch.nn.Module): - if hasattr(component, "_hf_hook"): - is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload) - is_sequential_cpu_offload = isinstance(getattr(component, "_hf_hook"), AlignDevicesHook) - logger.info( - "Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again." - ) - recursive = is_sequential_cpu_offload - remove_hook_from_module(component, recurse=recursive) - state_dict, network_alphas = self.lora_state_dict( - pretrained_model_name_or_path_or_dict, - unet_config=self.unet.config, - **kwargs, - ) - self.load_lora_into_unet(state_dict, network_alphas=network_alphas, unet=self.unet) - - text_encoder_state_dict = {k: v for k, v in state_dict.items() if "text_encoder." in k} - if len(text_encoder_state_dict) > 0: - self.load_lora_into_text_encoder( - text_encoder_state_dict, - network_alphas=network_alphas, - text_encoder=self.text_encoder, - prefix="text_encoder", - lora_scale=self.lora_scale, - ) - - text_encoder_2_state_dict = {k: v for k, v in state_dict.items() if "text_encoder_2." in k} - if len(text_encoder_2_state_dict) > 0: - self.load_lora_into_text_encoder( - text_encoder_2_state_dict, - network_alphas=network_alphas, - text_encoder=self.text_encoder_2, - prefix="text_encoder_2", - lora_scale=self.lora_scale, - ) - - # Offload back. - if is_model_cpu_offload: - self.enable_model_cpu_offload() - elif is_sequential_cpu_offload: - self.enable_sequential_cpu_offload() - - @classmethod - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.save_lora_weights - def save_lora_weights( - self, - save_directory: Union[str, os.PathLike], - unet_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None, - text_encoder_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None, - text_encoder_2_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None, - is_main_process: bool = True, - weight_name: str = None, - save_function: Callable = None, - safe_serialization: bool = True, - ): - state_dict = {} - - def pack_weights(layers, prefix): - layers_weights = layers.state_dict() if isinstance(layers, torch.nn.Module) else layers - layers_state_dict = {f"{prefix}.{module_name}": param for module_name, param in layers_weights.items()} - return layers_state_dict - - if not (unet_lora_layers or text_encoder_lora_layers or text_encoder_2_lora_layers): - raise ValueError( - "You must pass at least one of `unet_lora_layers`, `text_encoder_lora_layers` or `text_encoder_2_lora_layers`." 
- ) - - if unet_lora_layers: - state_dict.update(pack_weights(unet_lora_layers, "unet")) - - if text_encoder_lora_layers and text_encoder_2_lora_layers: - state_dict.update(pack_weights(text_encoder_lora_layers, "text_encoder")) - state_dict.update(pack_weights(text_encoder_2_lora_layers, "text_encoder_2")) - - self.write_lora_layers( - state_dict=state_dict, - save_directory=save_directory, - is_main_process=is_main_process, - weight_name=weight_name, - save_function=save_function, - safe_serialization=safe_serialization, - ) - - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._remove_text_encoder_monkey_patch - def _remove_text_encoder_monkey_patch(self): - self._remove_text_encoder_monkey_patch_classmethod(self.text_encoder) - self._remove_text_encoder_monkey_patch_classmethod(self.text_encoder_2) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py index 786231dd5c15..0f951c78cb23 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py @@ -20,7 +20,7 @@ from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import ( AttnProcessor2_0, @@ -93,7 +93,7 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): class StableDiffusionXLInstructPix2PixPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin + DiffusionPipeline, TextualInversionLoaderMixin, FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin ): r""" Pipeline for pixel-level image editing by following text instructions. Based on Stable Diffusion XL. @@ -102,10 +102,10 @@ class StableDiffusionXLInstructPix2PixPipeline( library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) In addition the pipeline inherits the following loading methods: - - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] + - *LoRA*: [`loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] as well as the following saving methods: - - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] + - *LoRA*: [`loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] Args: vae ([`AutoencoderKL`]): @@ -268,7 +268,7 @@ def encode_prompt( # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -710,6 +710,14 @@ def __call__( For most cases, `target_size` should be set to the desired height and width of the generated image. If not specified it will default to `(width, height)`. Part of SDXL's micro-conditioning as explained in section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). 
+ aesthetic_score (`float`, *optional*, defaults to 6.0): + Used to simulate an aesthetic score of the generated image by influencing the positive text condition. + Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + negative_aesthetic_score (`float`, *optional*, defaults to 2.5): + Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). Can be used to + simulate an aesthetic score of the generated image by influencing the negative text condition. Examples: diff --git a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py index d7441db70741..6019d93fe02d 100644 --- a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py @@ -23,7 +23,7 @@ from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput from ...image_processor import VaeImageProcessor -from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, MultiAdapter, T2IAdapter, UNet2DConditionModel from ...models.attention_processor import ( AttnProcessor2_0, @@ -122,7 +122,7 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): class StableDiffusionXLAdapterPipeline( - DiffusionPipeline, FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin + DiffusionPipeline, FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin ): r""" Pipeline for text-to-image generation using Stable Diffusion augmented with T2I-Adapter @@ -280,7 +280,7 @@ def encode_prompt( # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale From b47f5115da50769e554501560540969e02585adc Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 13 Sep 2023 11:21:04 +0200 Subject: [PATCH 31/37] [Lora] fix lora fuse unfuse (#5003) * fix lora fuse unfuse * add same changes to loaders.py * add test --------- Co-authored-by: multimodalart --- src/diffusers/loaders.py | 2 +- src/diffusers/models/lora.py | 4 ++-- tests/models/test_lora_layers.py | 40 +++++++++++++++++++++++++++++++- 3 files changed, 42 insertions(+), 4 deletions(-) diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index 51814a611a00..2bec9a9ab349 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -121,7 +121,7 @@ def _fuse_lora(self, lora_scale=1.0): self.lora_scale = lora_scale def _unfuse_lora(self): - if not (hasattr(self, "w_up") and hasattr(self, "w_down")): + if not (getattr(self, "w_up", None) is not None and getattr(self, "w_down", None) is not None): return fused_weight = self.regular_linear_layer.weight.data diff --git a/src/diffusers/models/lora.py b/src/diffusers/models/lora.py index 834a7051b06d..cc8e3e231e2b 100644 --- a/src/diffusers/models/lora.py +++ b/src/diffusers/models/lora.py @@ -139,7 +139,7 @@ def _fuse_lora(self, lora_scale=1.0): self._lora_scale = 
lora_scale def _unfuse_lora(self): - if not (hasattr(self, "w_up") and hasattr(self, "w_down")): + if not (getattr(self, "w_up", None) is not None and getattr(self, "w_down", None) is not None): return fused_weight = self.weight.data @@ -204,7 +204,7 @@ def _fuse_lora(self, lora_scale=1.0): self._lora_scale = lora_scale def _unfuse_lora(self): - if not (hasattr(self, "w_up") and hasattr(self, "w_down")): + if not (getattr(self, "w_up", None) is not None and getattr(self, "w_down", None) is not None): return fused_weight = self.weight.data diff --git a/tests/models/test_lora_layers.py b/tests/models/test_lora_layers.py index 9affb37aa5d6..ef6ade9af5c1 100644 --- a/tests/models/test_lora_layers.py +++ b/tests/models/test_lora_layers.py @@ -43,7 +43,7 @@ LoRAAttnProcessor2_0, XFormersAttnProcessor, ) -from diffusers.utils.testing_utils import floats_tensor, require_torch_gpu, slow, torch_device +from diffusers.utils.testing_utils import floats_tensor, nightly, require_torch_gpu, slow, torch_device def create_unet_lora_layers(unet: nn.Module): @@ -1497,3 +1497,41 @@ def test_sdxl_1_0_lora_with_sequential_cpu_offloading(self): expected = np.array([0.4468, 0.4087, 0.4134, 0.366, 0.3202, 0.3505, 0.3786, 0.387, 0.3535]) self.assertTrue(np.allclose(images, expected, atol=1e-3)) + + @nightly + def test_sequential_fuse_unfuse(self): + pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0") + + # 1. round + pipe.load_lora_weights("Pclanglais/TintinIA") + pipe.fuse_lora() + + generator = torch.Generator().manual_seed(0) + images = pipe( + "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2 + ).images + image_slice = images[0, -3:, -3:, -1].flatten() + + pipe.unfuse_lora() + + # 2. round + pipe.load_lora_weights("ProomptEngineer/pe-balloon-diffusion-style") + pipe.fuse_lora() + pipe.unfuse_lora() + + # 3. round + pipe.load_lora_weights("ostris/crayon_style_lora_sdxl") + pipe.fuse_lora() + pipe.unfuse_lora() + + # 4. 
back to 1st round + pipe.load_lora_weights("Pclanglais/TintinIA") + pipe.fuse_lora() + + generator = torch.Generator().manual_seed(0) + images_2 = pipe( + "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2 + ).images + image_slice_2 = images_2[0, -3:, -3:, -1].flatten() + + self.assertTrue(np.allclose(image_slice, image_slice_2, atol=1e-3)) From 34bfe98eafa38215e6e86a9d3622c496fab7ffe9 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Wed, 13 Sep 2023 14:53:59 +0530 Subject: [PATCH 32/37] Gligen Text to Image fix (#5010) * fix gligen clip import issue * fix dtype issue with gligen text to image pipeline * make fix copies --- src/diffusers/__init__.py | 2 ++ src/diffusers/pipelines/__init__.py | 2 ++ ...pipeline_stable_diffusion_gligen_text_image.py | 2 ++ .../utils/dummy_torch_and_transformers_objects.py | 15 +++++++++++++++ 4 files changed, 21 insertions(+) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 9c1d1fe2e757..f504b0c048db 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -197,6 +197,7 @@ "AudioLDM2ProjectionModel", "AudioLDM2UNet2DConditionModel", "AudioLDMPipeline", + "CLIPImageProjection", "CycleDiffusionPipeline", "IFImg2ImgPipeline", "IFImg2ImgSuperResolutionPipeline", @@ -530,6 +531,7 @@ AudioLDM2ProjectionModel, AudioLDM2UNet2DConditionModel, AudioLDMPipeline, + CLIPImageProjection, CycleDiffusionPipeline, IFImg2ImgPipeline, IFImg2ImgSuperResolutionPipeline, diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 8bf0a98de893..ef3566cf61c0 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -113,6 +113,7 @@ _import_structure["shap_e"] = ["ShapEImg2ImgPipeline", "ShapEPipeline"] _import_structure["stable_diffusion"].extend( [ + "CLIPImageProjection", "CycleDiffusionPipeline", "StableDiffusionAttendAndExcitePipeline", "StableDiffusionDepth2ImgPipeline", @@ -323,6 +324,7 @@ from .semantic_stable_diffusion import SemanticStableDiffusionPipeline from .shap_e import ShapEImg2ImgPipeline, ShapEPipeline from .stable_diffusion import ( + CLIPImageProjection, CycleDiffusionPipeline, StableDiffusionAttendAndExcitePipeline, StableDiffusionDepth2ImgPipeline, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_gligen_text_image.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_gligen_text_image.py index a7cdb168e384..65cb9d284552 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_gligen_text_image.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_gligen_text_image.py @@ -582,6 +582,8 @@ def get_clip_feature(self, input, normalize_constant, device, is_image=False): if input is None: return None inputs = self.processor(images=[input], return_tensors="pt").to(device) + inputs["pixel_values"] = inputs["pixel_values"].to(self.image_encoder.dtype) + outputs = self.image_encoder(**inputs) feature = outputs.image_embeds feature = self.image_project(feature).squeeze(0) diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index 5a123c1cd1ee..d831cc49b495 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -92,6 +92,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) +class 
CLIPImageProjection(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class CycleDiffusionPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] From 6d6a08f1f1a62c90e501f1b2771af94fe2ac0678 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 13 Sep 2023 11:29:13 +0200 Subject: [PATCH 33/37] [Flax->PT] Fix flaky testing (#5011) fix flaky flax class name --- src/diffusers/pipelines/pipeline_utils.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 5ead6ef810d9..8b12ab6a0d6f 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -343,9 +343,7 @@ def _get_pipeline_class( diffusers_module = importlib.import_module(class_obj.__module__.split(".")[0]) class_name = config["_class_name"] - - if class_name.startswith("Flax"): - class_name = class_name[4:] + class_name = class_name[4:] if class_name.startswith("Flax") else class_name pipeline_cls = getattr(diffusers_module, class_name) @@ -1083,8 +1081,7 @@ def load_module(name, value): # 6. Load each module in the pipeline for name, (library_name, class_name) in tqdm(init_dict.items(), desc="Loading pipeline components..."): # 6.1 - now that JAX/Flax is an official framework of the library, we might load from Flax names - if class_name.startswith("Flax"): - class_name = class_name[4:] + class_name = class_name[4:] if class_name.startswith("Flax") else class_name # 6.2 Define all importable classes is_pipeline_module = hasattr(pipelines, library_name) @@ -1611,6 +1608,8 @@ def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]: # retrieve pipeline class from local file cls_name = cls.load_config(os.path.join(cached_folder, "model_index.json")).get("_class_name", None) + cls_name = cls_name[4:] if cls_name.startswith("Flax") else cls_name + pipeline_class = getattr(diffusers, cls_name, None) if pipeline_class is not None and pipeline_class._load_connected_pipes: From 0ea51627f190db401d195399051bc5015b6bc35a Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Wed, 13 Sep 2023 10:50:24 +0100 Subject: [PATCH 34/37] [Core] Fix dtype in InstructPix2Pix SDXL while computing `image_latents` (#5013) * check out dtypes. * check out dtypes. * check out dtypes. * check out dtypes. * check out dtypes. * check out dtypes. * check out dtypes. * potential fix * check out dtypes. * check out dtypes. * working? 
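
In short, the fix temporarily upcasts the fp16 VAE to float32 while computing `image_latents` (the VAE overflows in float16), casts it back afterwards, and aligns the latents with the VAE dtype before they are concatenated for classifier-free guidance. A minimal sketch of the pattern, with a bare `vae` and `image` standing in for the pipeline attributes (not the pipeline code itself):

```python
import torch

def encode_image(vae, image):
    # the fp16 VAE overflows while encoding, so temporarily upcast it to fp32
    needs_upcasting = vae.dtype == torch.float16 and vae.config.force_upcast
    if needs_upcasting:
        vae.to(dtype=torch.float32)
        image = image.to(dtype=torch.float32)

    image_latents = vae.encode(image).latent_dist.mode()

    if needs_upcasting:
        # cast back so the rest of the pipeline keeps running in fp16
        vae.to(dtype=torch.float16)
    # keep the latents consistent with the (possibly restored) VAE dtype
    if image_latents.dtype != vae.dtype:
        image_latents = image_latents.to(dtype=vae.dtype)
    return image_latents
```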
--- .../pipeline_stable_diffusion_xl_instruct_pix2pix.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py index 0f951c78cb23..63a45a45998e 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py @@ -495,7 +495,8 @@ def prepare_image_latents( image_latents = image else: # make sure the VAE is in float32 mode, as it overflows in float16 - if self.vae.dtype == torch.float16 and self.vae.config.force_upcast: + needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast + if needs_upcasting: self.upcast_vae() image = image.to(next(iter(self.vae.post_quant_conv.parameters())).dtype) @@ -511,6 +512,10 @@ def prepare_image_latents( else: image_latents = self.vae.encode(image).latent_dist.mode() + # cast back to fp16 if needed + if needs_upcasting: + self.vae.to(dtype=torch.float16) + if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0: # expand image_latents for batch_size deprecation_message = ( @@ -533,6 +538,9 @@ def prepare_image_latents( uncond_image_latents = torch.zeros_like(image_latents) image_latents = torch.cat([image_latents, image_latents, uncond_image_latents], dim=0) + if image_latents.dtype != self.vae.dtype: + image_latents = image_latents.to(dtype=self.vae.dtype) + return image_latents # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._get_add_time_ids From 77373c5eb15ac3d1474bf49bf2ed30ac6bfcb670 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 13 Sep 2023 14:54:59 +0200 Subject: [PATCH 35/37] [Wuerstchen] fix compel usage (#4999) * fix compel usage * minor changes in documentation * fix tests * fix more * fix more * typos * fix tests * formatting --------- Co-authored-by: Dominic Rampas Co-authored-by: Patrick von Platen --- docs/source/en/api/pipelines/wuerstchen.md | 16 +-- .../wuerstchen/pipeline_wuerstchen.py | 110 ++++++----------- .../pipeline_wuerstchen_combined.py | 11 +- .../wuerstchen/pipeline_wuerstchen_prior.py | 111 ++++++++++-------- 4 files changed, 109 insertions(+), 139 deletions(-) diff --git a/docs/source/en/api/pipelines/wuerstchen.md b/docs/source/en/api/pipelines/wuerstchen.md index 797248ebb4b7..3cd59f996bfd 100644 --- a/docs/source/en/api/pipelines/wuerstchen.md +++ b/docs/source/en/api/pipelines/wuerstchen.md @@ -8,9 +8,12 @@ The abstract from the paper is: *We introduce Würstchen, a novel technique for text-to-image synthesis that unites competitive performance with unprecedented cost-effectiveness and ease of training on constrained hardware. Building on recent advancements in machine learning, our approach, which utilizes latent diffusion strategies at strong latent image compression rates, significantly reduces the computational burden, typically associated with state-of-the-art models, while preserving, if not enhancing, the quality of generated images. Wuerstchen achieves notable speed improvements at inference time, thereby rendering real-time applications more viable. One of the key advantages of our method lies in its modest training requirements of only 9,200 GPU hours, slashing the usual costs significantly without compromising the end performance. 
In a comparison against the state-of-the-art, we found the approach to yield strong competitiveness. This paper opens the door to a new line of research that prioritizes both performance and computational accessibility, hence democratizing the use of sophisticated AI technologies. Through Wuerstchen, we demonstrate a compelling stride forward in the realm of text-to-image synthesis, offering an innovative path to explore in future research.* +## Würstchen Overview +Würstchen is a diffusion model, whose text-conditional model works in a highly compressed latent space of images. Why is this important? Compressing data can reduce computational costs for both training and inference by magnitudes. Training on 1024x1024 images is way more expensive than training on 32x32. Usually, other works make use of a relatively small compression, in the range of 4x - 8x spatial compression. Würstchen takes this to an extreme. Through its novel design, we achieve a 42x spatial compression. This was unseen before because common methods fail to faithfully reconstruct detailed images after 16x spatial compression. Würstchen employs a two-stage compression, what we call Stage A and Stage B. Stage A is a VQGAN, and Stage B is a Diffusion Autoencoder (more details can be found in the [paper](https://huggingface.co/papers/2306.00637)). A third model, Stage C, is learned in that highly compressed latent space. This training requires fractions of the compute used for current top-performing models, while also allowing cheaper and faster inference. + ## Würstchen v2 comes to Diffusers -After the initial paper release, we have improved numerous things in the architecture, training and sampling, making Würstchen competetive to current state-of-the-art models in many ways. We are excited to release this new version together with Diffusers. Here is a list of the improvements. +After the initial paper release, we have improved numerous things in the architecture, training and sampling, making Würstchen competitive to current state-of-the-art models in many ways. We are excited to release this new version together with Diffusers. Here is a list of the improvements. - Higher resolution (1024x1024 up to 2048x2048) - Faster inference @@ -22,16 +25,16 @@ We are releasing 3 checkpoints for the text-conditional image generation model ( - v2-base - v2-aesthetic -- v2-interpolated (50% interpolation between v2-base and v2-aesthetic) +- **(default)** v2-interpolated (50% interpolation between v2-base and v2-aesthetic) -We recommend to use v2-interpolated, as it has a nice touch of both photorealism and aesthetic. Use v2-base for finetunings as it does not have a style bias and use v2-aesthetic for very artistic generations. +We recommend using v2-interpolated, as it has a nice touch of both photorealism and aesthetics. Use v2-base for finetunings as it does not have a style bias and use v2-aesthetic for very artistic generations. A comparison can be seen here: ## Text-to-Image Generation -For the sake of usability Würstchen can be used with a single pipeline. This pipeline is called `WuerstchenCombinedPipeline` and can be used as follows: +For the sake of usability, Würstchen can be used with a single pipeline. 
This pipeline can be used as follows: ```python import torch @@ -85,7 +88,6 @@ decoder_output = decoder_pipeline( image_embeddings=prior_output.image_embeddings, prompt=caption, negative_prompt=negative_prompt, - num_images_per_prompt=num_images_per_prompt, guidance_scale=0.0, output_type="pil", ).images @@ -95,8 +97,8 @@ decoder_output = decoder_pipeline( You can make use of `torch.compile` function and gain a speed-up of about 2-3x: ```python -pipeline.prior = torch.compile(pipeline.prior, mode="reduce-overhead", fullgraph=True) -pipeline.decoder = torch.compile(pipeline.decoder, mode="reduce-overhead", fullgraph=True) +prior_pipeline.prior = torch.compile(prior_pipeline.prior, mode="reduce-overhead", fullgraph=True) +decoder_pipeline.decoder = torch.compile(decoder_pipeline.decoder, mode="reduce-overhead", fullgraph=True) ``` ## Limitations diff --git a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py index 55e4b01f3da2..9ea6f979c239 100644 --- a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +++ b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py @@ -19,7 +19,7 @@ from transformers import CLIPTextModel, CLIPTokenizer from ...schedulers import DDPMWuerstchenScheduler -from ...utils import is_accelerate_available, is_accelerate_version, logging, replace_example_docstring +from ...utils import logging, replace_example_docstring from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput from .modeling_paella_vq_model import PaellaVQModel @@ -72,6 +72,8 @@ class WuerstchenDecoderPipeline(DiffusionPipeline): width=int(24*10.67)=256 in order to match the training conditions. """ + model_cpu_offload_seq = "text_encoder->decoder->vqgan" + def __init__( self, tokenizer: CLIPTokenizer, @@ -103,35 +105,6 @@ def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): latents = latents * scheduler.init_noise_sigma return latents - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.text_encoder, self.decoder]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - # We'll offload the last model manually. 
- self.prior_hook = hook - - _, hook = cpu_offload_with_hook(self.vqgan, device, prev_module_hook=self.prior_hook) - - self.final_offload_hook = hook - def encode_prompt( self, prompt, @@ -214,48 +187,6 @@ def encode_prompt( # to avoid doing two forward passes return text_encoder_hidden_states, uncond_text_encoder_hidden_states - def check_inputs( - self, - image_embeddings, - prompt, - negative_prompt, - num_inference_steps, - do_classifier_free_guidance, - device, - dtype, - ): - if not isinstance(prompt, list): - if isinstance(prompt, str): - prompt = [prompt] - else: - raise TypeError(f"'prompt' must be of type 'list' or 'str', but got {type(prompt)}.") - - if do_classifier_free_guidance: - if negative_prompt is not None and not isinstance(negative_prompt, list): - if isinstance(negative_prompt, str): - negative_prompt = [negative_prompt] - else: - raise TypeError( - f"'negative_prompt' must be of type 'list' or 'str', but got {type(negative_prompt)}." - ) - - if isinstance(image_embeddings, list): - image_embeddings = torch.cat(image_embeddings, dim=0) - if isinstance(image_embeddings, np.ndarray): - image_embeddings = torch.Tensor(image_embeddings, device=device).to(dtype=dtype) - if not isinstance(image_embeddings, torch.Tensor): - raise TypeError( - f"'image_embeddings' must be of type 'torch.Tensor' or 'np.array', but got {type(image_embeddings)}." - ) - - if not isinstance(num_inference_steps, int): - raise TypeError( - f"'num_inference_steps' must be of type 'int', but got {type(num_inference_steps)}\ - In Case you want to provide explicit timesteps, please use the 'timesteps' argument." - ) - - return image_embeddings, prompt, negative_prompt, num_inference_steps - @torch.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( @@ -324,9 +255,35 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 1. Check inputs. Raise error if not correct - image_embeddings, prompt, negative_prompt, num_inference_steps = self.check_inputs( - image_embeddings, prompt, negative_prompt, num_inference_steps, do_classifier_free_guidance, device, dtype - ) + if not isinstance(prompt, list): + if isinstance(prompt, str): + prompt = [prompt] + else: + raise TypeError(f"'prompt' must be of type 'list' or 'str', but got {type(prompt)}.") + + if do_classifier_free_guidance: + if negative_prompt is not None and not isinstance(negative_prompt, list): + if isinstance(negative_prompt, str): + negative_prompt = [negative_prompt] + else: + raise TypeError( + f"'negative_prompt' must be of type 'list' or 'str', but got {type(negative_prompt)}." + ) + + if isinstance(image_embeddings, list): + image_embeddings = torch.cat(image_embeddings, dim=0) + if isinstance(image_embeddings, np.ndarray): + image_embeddings = torch.Tensor(image_embeddings, device=device).to(dtype=dtype) + if not isinstance(image_embeddings, torch.Tensor): + raise TypeError( + f"'image_embeddings' must be of type 'torch.Tensor' or 'np.array', but got {type(image_embeddings)}." + ) + + if not isinstance(num_inference_steps, int): + raise TypeError( + f"'num_inference_steps' must be of type 'int', but got {type(num_inference_steps)}\ + In Case you want to provide explicit timesteps, please use the 'timesteps' argument." + ) # 2. 
Encode caption prompt_embeds, negative_prompt_embeds = self.encode_prompt( @@ -390,6 +347,9 @@ def __call__( latents = self.vqgan.config.scale_factor * latents images = self.vqgan.decode(latents).sample.clamp(0, 1) + # Offload all models + self.maybe_free_model_hooks() + if output_type not in ["pt", "np", "pil"]: raise ValueError(f"Only the output types `pt`, `np` and `pil` are supported not output_type={output_type}") diff --git a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py index 0695e6379668..2a8614b21e15 100644 --- a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +++ b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py @@ -62,7 +62,7 @@ class WuerstchenCombinedPipeline(DiffusionPipeline): The prior tokenizer to be used for text inputs. prior_text_encoder (`CLIPTextModel`): The prior text encoder to be used for text inputs. - prior (`WuerstchenPrior`): + prior_prior (`WuerstchenPrior`): The prior model to be used for prior pipeline. prior_scheduler (`DDPMWuerstchenScheduler`): The scheduler to be used for prior pipeline. @@ -119,8 +119,8 @@ def enable_model_cpu_offload(self, gpu_id=0): method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. """ - self.prior_pipe.enable_model_cpu_offload() - self.decoder_pipe.enable_model_cpu_offload() + self.prior_pipe.enable_model_cpu_offload(gpu_id=gpu_id) + self.decoder_pipe.enable_model_cpu_offload(gpu_id=gpu_id) def enable_sequential_cpu_offload(self, gpu_id=0): r""" @@ -144,7 +144,7 @@ def set_progress_bar_config(self, **kwargs): @replace_example_docstring(TEXT2IMAGE_EXAMPLE_DOC_STRING) def __call__( self, - prompt: Union[str, List[str]], + prompt: Optional[Union[str, List[str]]] = None, height: int = 512, width: int = 512, prior_num_inference_steps: int = 60, @@ -249,7 +249,7 @@ def __call__( outputs = self.decoder_pipe( image_embeddings=image_embeddings, - prompt=prompt, + prompt=prompt if prompt is not None else "", num_inference_steps=num_inference_steps, timesteps=decoder_timesteps, guidance_scale=decoder_guidance_scale, @@ -258,4 +258,5 @@ def __call__( output_type=output_type, return_dict=return_dict, ) + return outputs diff --git a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py index 46a6885c1f39..8e737a74bbfe 100644 --- a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +++ b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py @@ -23,8 +23,6 @@ from ...schedulers import DDPMWuerstchenScheduler from ...utils import ( BaseOutput, - is_accelerate_available, - is_accelerate_version, logging, replace_example_docstring, ) @@ -86,6 +84,8 @@ class WuerstchenPriorPipeline(DiffusionPipeline): A scheduler to be used in combination with `prior` to generate image embedding. """ + model_cpu_offload_seq = "text_encoder->prior" + def __init__( self, tokenizer: CLIPTokenizer, @@ -107,35 +107,6 @@ def __init__( latent_mean=latent_mean, latent_std=latent_std, resolution_multiple=resolution_multiple ) - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. 
Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.text_encoder]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - # We'll offload the last model manually. - self.prior_hook = hook - - _, hook = cpu_offload_with_hook(self.prior, device, prev_module_hook=self.prior_hook) - - self.final_offload_hook = hook - # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): if latents is None: @@ -249,22 +220,34 @@ def check_inputs( negative_prompt, num_inference_steps, do_classifier_free_guidance, - batch_size, + prompt_embeds=None, + negative_prompt_embeds=None, ): - if not isinstance(prompt, list): - if isinstance(prompt, str): - prompt = [prompt] - else: - raise TypeError(f"'prompt' must be of type 'list' or 'str', but got {type(prompt)}.") + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - if do_classifier_free_guidance: - if negative_prompt is not None and not isinstance(negative_prompt, list): - if isinstance(negative_prompt, str): - negative_prompt = [negative_prompt] - else: - raise TypeError( - f"'negative_prompt' must be of type 'list' or 'str', but got {type(negative_prompt)}." - ) + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) if not isinstance(num_inference_steps, int): raise TypeError( @@ -272,10 +255,6 @@ def check_inputs( In Case you want to provide explicit timesteps, please use the 'timesteps' argument." 
) - batch_size = len(prompt) if isinstance(prompt, list) else 1 - - return prompt, negative_prompt, num_inference_steps, batch_size - @torch.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( @@ -361,11 +340,36 @@ def __call__( # 0. Define commonly used variables device = self._execution_device do_classifier_free_guidance = guidance_scale > 1.0 - batch_size = len(prompt) if isinstance(prompt, list) else 1 + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] # 1. Check inputs. Raise error if not correct - prompt, negative_prompt, num_inference_steps, batch_size = self.check_inputs( - prompt, negative_prompt, num_inference_steps, do_classifier_free_guidance, batch_size + if prompt is not None and not isinstance(prompt, list): + if isinstance(prompt, str): + prompt = [prompt] + else: + raise TypeError(f"'prompt' must be of type 'list' or 'str', but got {type(prompt)}.") + + if do_classifier_free_guidance: + if negative_prompt is not None and not isinstance(negative_prompt, list): + if isinstance(negative_prompt, str): + negative_prompt = [negative_prompt] + else: + raise TypeError( + f"'negative_prompt' must be of type 'list' or 'str', but got {type(negative_prompt)}." + ) + + self.check_inputs( + prompt, + negative_prompt, + num_inference_steps, + do_classifier_free_guidance, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, ) # 2. Encode caption @@ -437,6 +441,9 @@ def __call__( # 10. Denormalize the latents latents = latents * self.config.latent_mean - self.config.latent_std + # Offload all models + self.maybe_free_model_hooks() + if output_type == "np": latents = latents.cpu().numpy() From b954c22a446301c644120f41c046a4d4c4553c7a Mon Sep 17 00:00:00 2001 From: Lucain Date: Wed, 13 Sep 2023 15:40:25 +0200 Subject: [PATCH 36/37] Fix broken link in docs (#5015) fix broken link --- docs/source/en/api/pipelines/auto_pipeline.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/auto_pipeline.md b/docs/source/en/api/pipelines/auto_pipeline.md index c0926997348c..68a0ede6d2fa 100644 --- a/docs/source/en/api/pipelines/auto_pipeline.md +++ b/docs/source/en/api/pipelines/auto_pipeline.md @@ -42,7 +42,7 @@ Check out the [AutoPipeline](/tutorials/autopipeline) tutorial to learn how to u `AutoPipeline` supports text-to-image, image-to-image, and inpainting for the following diffusion models: - [Stable Diffusion](./stable_diffusion) -- [ControlNet](./api/pipelines/controlnet) +- [ControlNet](./controlnet) - [Stable Diffusion XL (SDXL)](./stable_diffusion/stable_diffusion_xl) - [DeepFloyd IF](./if) - [Kandinsky](./kandinsky) From 19edca82f1ff194c07317369a92b470dbae97f34 Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Wed, 13 Sep 2023 15:21:15 -0700 Subject: [PATCH 37/37] [docs] Create clearer optimization sections (#4870) * refactor * update general optim sections * update more sections * few more updates * benchmark code --- docs/source/en/_toctree.yml | 50 ++- docs/source/en/optimization/fp16.md | 379 +----------------- docs/source/en/optimization/habana.md | 40 +- docs/source/en/optimization/memory.md | 367 +++++++++++++++++ docs/source/en/optimization/mps.md | 64 +-- docs/source/en/optimization/onnx.md | 66 +-- docs/source/en/optimization/open_vino.md | 49 +-- docs/source/en/optimization/opt_overview.md | 4 +- 
docs/source/en/optimization/tome.md | 123 +++--- docs/source/en/optimization/torch2.0.md | 133 +++--- docs/source/en/optimization/xformers.md | 12 +- .../stable_diffusion_jax_how_to.md | 164 ++++---- 12 files changed, 679 insertions(+), 772 deletions(-) create mode 100644 docs/source/en/optimization/memory.md diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index a2a08b52db42..b56d9c094dab 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -113,27 +113,35 @@ - sections: - local: optimization/opt_overview title: Overview - - local: optimization/fp16 - title: Memory and Speed - - local: optimization/torch2.0 - title: Torch2.0 support - - local: using-diffusers/stable_diffusion_jax_how_to - title: Stable Diffusion in JAX/Flax - - local: optimization/xformers - title: xFormers - - local: optimization/onnx - title: ONNX - - local: optimization/open_vino - title: OpenVINO - - local: optimization/coreml - title: Core ML - - local: optimization/mps - title: MPS - - local: optimization/habana - title: Habana Gaudi - - local: optimization/tome - title: Token Merging - title: Optimization/Special Hardware + - sections: + - local: optimization/fp16 + title: Speed up inference + - local: optimization/memory + title: Reduce memory usage + - local: optimization/torch2.0 + title: Torch 2.0 + - local: optimization/xformers + title: xFormers + - local: optimization/tome + title: Token merging + title: General optimizations + - sections: + - local: using-diffusers/stable_diffusion_jax_how_to + title: JAX/Flax + - local: optimization/onnx + title: ONNX + - local: optimization/open_vino + title: OpenVINO + - local: optimization/coreml + title: Core ML + title: Optimized model types + - sections: + - local: optimization/mps + title: Metal Performance Shaders (MPS) + - local: optimization/habana + title: Habana Gaudi + title: Optimized hardware + title: Optimization - sections: - local: conceptual/philosophy title: Philosophy diff --git a/docs/source/en/optimization/fp16.md b/docs/source/en/optimization/fp16.md index 063ad8b305ae..2ac16786eb46 100644 --- a/docs/source/en/optimization/fp16.md +++ b/docs/source/en/optimization/fp16.md @@ -10,13 +10,19 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. --> -# Memory and speed +# Speed up inference -We present some techniques and ideas to optimize 🤗 Diffusers _inference_ for memory or speed. As a general rule, we recommend the use of [xFormers](https://github.com/facebookresearch/xformers) for memory efficient attention, please see the recommended [installation instructions](xformers). +There are several ways to optimize 🤗 Diffusers for inference speed. As a general rule of thumb, we recommend using either [xFormers](xformers) or `torch.nn.functional.scaled_dot_product_attention` in PyTorch 2.0 for their memory-efficient attention. -We'll discuss how the following settings impact performance and memory. + + +In many cases, optimizing for speed or memory leads to improved performance in the other, so you should try to optimize for both whenever you can. This guide focuses on inference speed, but you can learn more about preserving memory in the [Reduce memory usage](memory) guide. + + + +The results below are obtained from generating a single 512x512 image from the prompt `a photo of an astronaut riding a horse on mars` with 50 DDIM steps on a Nvidia Titan RTX, demonstrating the speed-up you can expect. 
-| | Latency | Speedup |
+| | latency | speed-up |
| ---------------- | ------- | ------- |
| original | 9.50s | x1 |
| fp16 | 3.61s | x2.63 |
@@ -24,15 +30,9 @@ We'll discuss how the following settings impact performance and memory.
| traced UNet | 3.21s | x2.96 |
| memory efficient attention | 2.63s | x3.61 |

-
-  obtained on NVIDIA TITAN RTX by generating a single image of size 512x512 from
-  the prompt "a photo of an astronaut riding a horse on mars" with 50 DDIM
-  steps.
-

-### Use tf32 instead of fp32 (on Ampere and later CUDA devices)
+## Use TensorFloat-32

-On Ampere and later CUDA devices matrix multiplications and convolutions can use the TensorFloat32 (TF32) mode for faster but slightly less accurate computations. By default PyTorch enables TF32 mode for convolutions but not matrix multiplications, and unless a network requires full float32 precision we recommend enabling this setting for matrix multiplications, too. It can significantly speed up computations with typically negligible loss of numerical accuracy. You can read more about it [here](https://huggingface.co/docs/transformers/v4.18.0/en/performance#tf32). All you need to do is to add this before your inference:
+On Ampere and later CUDA devices, matrix multiplications and convolutions can use the [TensorFloat-32 (TF32)](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) mode for faster, but slightly less accurate computations. By default, PyTorch enables TF32 mode for convolutions but not matrix multiplications. Unless your network requires full float32 precision, we recommend enabling TF32 for matrix multiplications. It can significantly speed up computations with typically negligible loss in numerical accuracy.

```python
import torch

torch.backends.cuda.matmul.allow_tf32 = True
```

-## Half precision weights
+You can learn more about TF32 in the [Mixed precision training](https://huggingface.co/docs/transformers/en/perf_train_gpu_one#tf32) guide.

-To save more GPU memory and get more speed, you can load and run the model weights directly in half precision. This involves loading the float16 version of the weights, which was saved to a branch named `fp16`, and telling PyTorch to use the `float16` type when loading them:
+## Half-precision weights
+
+To save GPU memory and get more speed, try loading and running the model weights directly in half-precision or float16:

```Python
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
    use_safetensors=True,
)
pipe = pipe.to("cuda")

prompt = "a photo of an astronaut riding a horse on mars"
image = pipe(prompt).images[0]
```

@@ -61,351 +63,6 @@ image = pipe(prompt).images[0]

- It is strongly discouraged to make use of [`torch.autocast`](https://pytorch.org/docs/stable/amp.html#torch.autocast) in any of the pipelines as it can lead to black images and is always slower than using pure
- float16 precision.
+Don't use [`torch.autocast`](https://pytorch.org/docs/stable/amp.html#torch.autocast) in any of the pipelines as it can lead to black images and is always slower than pure float16 precision.
-
-
-## Sliced VAE decode for larger batches
-
-To decode large batches of images with limited VRAM, or to enable batches with 32 images or more, you can use sliced VAE decode that decodes the batch latents one image at a time.
-
-You likely want to couple this with [`~StableDiffusionPipeline.enable_xformers_memory_efficient_attention`] to further minimize memory use.
-
-To perform the VAE decode one image at a time, invoke [`~StableDiffusionPipeline.enable_vae_slicing`] in your pipeline before inference.
For example: - -```Python -import torch -from diffusers import StableDiffusionPipeline - -pipe = StableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - torch_dtype=torch.float16, - use_safetensors=True, -) -pipe = pipe.to("cuda") - -prompt = "a photo of an astronaut riding a horse on mars" -pipe.enable_vae_slicing() -images = pipe([prompt] * 32).images -``` - -You may see a small performance boost in VAE decode on multi-image batches. There should be no performance impact on single-image batches. - - -## Tiled VAE decode and encode for large images - -Tiled VAE processing makes it possible to work with large images on limited VRAM. For example, generating 4k images in 8GB of VRAM. Tiled VAE decoder splits the image into overlapping tiles, decodes the tiles, and blends the outputs to make the final image. - -You want to couple this with [`~StableDiffusionPipeline.enable_xformers_memory_efficient_attention`] to further minimize memory use. - -To use tiled VAE processing, invoke [`~StableDiffusionPipeline.enable_vae_tiling`] in your pipeline before inference. For example: - -```python -import torch -from diffusers import StableDiffusionPipeline, UniPCMultistepScheduler - -pipe = StableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - torch_dtype=torch.float16, - use_safetensors=True, -) -pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config) -pipe = pipe.to("cuda") -prompt = "a beautiful landscape photograph" -pipe.enable_vae_tiling() -pipe.enable_xformers_memory_efficient_attention() - -image = pipe([prompt], width=3840, height=2224, num_inference_steps=20).images[0] -``` - -The output image will have some tile-to-tile tone variation from the tiles having separate decoders, but you shouldn't see sharp seams between the tiles. The tiling is turned off for images that are 512x512 or smaller. - - - -## Offloading to CPU with accelerate for memory savings - -For additional memory savings, you can offload the weights to CPU and only load them to GPU when performing the forward pass. - -To perform CPU offloading, all you have to do is invoke [`~StableDiffusionPipeline.enable_sequential_cpu_offload`]: - -```Python -import torch -from diffusers import StableDiffusionPipeline - -pipe = StableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - torch_dtype=torch.float16, - use_safetensors=True, -) - -prompt = "a photo of an astronaut riding a horse on mars" -pipe.enable_sequential_cpu_offload() -image = pipe(prompt).images[0] -``` - -And you can get the memory consumption to < 3GB. - -Note that this method works at the submodule level, not on whole models. This is the best way to minimize memory consumption, but inference is much slower due to the iterative nature of the process. The UNet component of the pipeline runs several times (as many as `num_inference_steps`); each time, the different submodules of the UNet are sequentially onloaded and then offloaded as they are needed, so the number of memory transfers is large. - - -Consider using model offloading as another point in the optimization space: it will be much faster, but memory savings won't be as large. - - -It is also possible to chain offloading with attention slicing for minimal memory consumption (< 2GB). 
- -```Python -import torch -from diffusers import StableDiffusionPipeline - -pipe = StableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - torch_dtype=torch.float16, - use_safetensors=True, -) - -prompt = "a photo of an astronaut riding a horse on mars" -pipe.enable_sequential_cpu_offload() - -image = pipe(prompt).images[0] -``` - -**Note**: When using `enable_sequential_cpu_offload()`, it is important to **not** move the pipeline to CUDA beforehand or else the gain in memory consumption will only be minimal. See [this issue](https://github.com/huggingface/diffusers/issues/1934) for more information. - -**Note**: `enable_sequential_cpu_offload()` is a stateful operation that installs hooks on the models. - - - -## Model offloading for fast inference and memory savings - -[Sequential CPU offloading](#sequential_offloading), as discussed in the previous section, preserves a lot of memory but makes inference slower, because submodules are moved to GPU as needed, and immediately returned to CPU when a new module runs. - -Full-model offloading is an alternative that moves whole models to the GPU, instead of handling each model's constituent _modules_. This results in a negligible impact on inference time (compared with moving the pipeline to `cuda`), while still providing some memory savings. - -In this scenario, only one of the main components of the pipeline (typically: text encoder, unet and vae) -will be in the GPU while the others wait in the CPU. Components like the UNet that run for multiple iterations will stay on GPU until they are no longer needed. - -This feature can be enabled by invoking `enable_model_cpu_offload()` on the pipeline, as shown below. - -```Python -import torch -from diffusers import StableDiffusionPipeline - -pipe = StableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - torch_dtype=torch.float16, - use_safetensors=True, -) - -prompt = "a photo of an astronaut riding a horse on mars" -pipe.enable_model_cpu_offload() -image = pipe(prompt).images[0] -``` - -This is also compatible with attention slicing for additional memory savings. - -```Python -import torch -from diffusers import StableDiffusionPipeline - -pipe = StableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - torch_dtype=torch.float16, - use_safetensors=True, -) - -prompt = "a photo of an astronaut riding a horse on mars" -pipe.enable_model_cpu_offload() - -image = pipe(prompt).images[0] -``` - - -This feature requires `accelerate` version 0.17.0 or larger. - - -**Note**: `enable_model_cpu_offload()` is a stateful operation that installs hooks on the models and state on the pipeline. In order to properly offload -models after they are called, it is required that the entire pipeline is run and models are called in the order the pipeline expects them to be. Exercise caution -if models are re-used outside the context of the pipeline after hooks have been installed. See [accelerate](https://huggingface.co/docs/accelerate/v0.18.0/en/package_reference/big_modeling#accelerate.hooks.remove_hook_from_module) -for further docs on removing hooks. - -## Using Channels Last memory format - -Channels last memory format is an alternative way of ordering NCHW tensors in memory preserving dimensions ordering. Channels last tensors ordered in such a way that channels become the densest dimension (aka storing images pixel-per-pixel). 
Since not all operators currently support channels last format it may result in a worst performance, so it's better to try it and see if it works for your model. - -For example, in order to set the UNet model in our pipeline to use channels last format, we can use the following: - -```python -print(pipe.unet.conv_out.state_dict()["weight"].stride()) # (2880, 9, 3, 1) -pipe.unet.to(memory_format=torch.channels_last) # in-place operation -print( - pipe.unet.conv_out.state_dict()["weight"].stride() -) # (2880, 1, 960, 320) having a stride of 1 for the 2nd dimension proves that it works -``` - -## Tracing - -Tracing runs an example input tensor through your model, and captures the operations that are invoked as that input makes its way through the model's layers so that an executable or `ScriptFunction` is returned that will be optimized using just-in-time compilation. - -To trace our UNet model, we can use the following: - -```python -import time -import torch -from diffusers import StableDiffusionPipeline -import functools - -# torch disable grad -torch.set_grad_enabled(False) - -# set variables -n_experiments = 2 -unet_runs_per_experiment = 50 - - -# load inputs -def generate_inputs(): - sample = torch.randn(2, 4, 64, 64).half().cuda() - timestep = torch.rand(1).half().cuda() * 999 - encoder_hidden_states = torch.randn(2, 77, 768).half().cuda() - return sample, timestep, encoder_hidden_states - - -pipe = StableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - torch_dtype=torch.float16, - use_safetensors=True, -).to("cuda") -unet = pipe.unet -unet.eval() -unet.to(memory_format=torch.channels_last) # use channels_last memory format -unet.forward = functools.partial(unet.forward, return_dict=False) # set return_dict=False as default - -# warmup -for _ in range(3): - with torch.inference_mode(): - inputs = generate_inputs() - orig_output = unet(*inputs) - -# trace -print("tracing..") -unet_traced = torch.jit.trace(unet, inputs) -unet_traced.eval() -print("done tracing") - - -# warmup and optimize graph -for _ in range(5): - with torch.inference_mode(): - inputs = generate_inputs() - orig_output = unet_traced(*inputs) - - -# benchmarking -with torch.inference_mode(): - for _ in range(n_experiments): - torch.cuda.synchronize() - start_time = time.time() - for _ in range(unet_runs_per_experiment): - orig_output = unet_traced(*inputs) - torch.cuda.synchronize() - print(f"unet traced inference took {time.time() - start_time:.2f} seconds") - for _ in range(n_experiments): - torch.cuda.synchronize() - start_time = time.time() - for _ in range(unet_runs_per_experiment): - orig_output = unet(*inputs) - torch.cuda.synchronize() - print(f"unet inference took {time.time() - start_time:.2f} seconds") - -# save the model -unet_traced.save("unet_traced.pt") -``` - -Then we can replace the `unet` attribute of the pipeline with the traced model like the following - -```python -from diffusers import StableDiffusionPipeline -import torch -from dataclasses import dataclass - - -@dataclass -class UNet2DConditionOutput: - sample: torch.FloatTensor - - -pipe = StableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - torch_dtype=torch.float16, - use_safetensors=True, -).to("cuda") - -# use jitted unet -unet_traced = torch.jit.load("unet_traced.pt") - - -# del pipe.unet -class TracedUNet(torch.nn.Module): - def __init__(self): - super().__init__() - self.in_channels = pipe.unet.in_channels - self.device = pipe.unet.device - - def forward(self, latent_model_input, t, 
encoder_hidden_states): - sample = unet_traced(latent_model_input, t, encoder_hidden_states)[0] - return UNet2DConditionOutput(sample=sample) - - -pipe.unet = TracedUNet() - -with torch.inference_mode(): - image = pipe([prompt] * 1, num_inference_steps=50).images[0] -``` - - -## Memory Efficient Attention - -Recent work on optimizing the bandwitdh in the attention block has generated huge speed ups and gains in GPU memory usage. The most recent being Flash Attention from @tridao: [code](https://github.com/HazyResearch/flash-attention), [paper](https://arxiv.org/pdf/2205.14135.pdf). - -Here are the speedups we obtain on a few Nvidia GPUs when running the inference at 512x512 with a batch size of 1 (one prompt): - -| GPU | Base Attention FP16 | Memory Efficient Attention FP16 | -|------------------ |--------------------- |--------------------------------- | -| NVIDIA Tesla T4 | 3.5it/s | 5.5it/s | -| NVIDIA 3060 RTX | 4.6it/s | 7.8it/s | -| NVIDIA A10G | 8.88it/s | 15.6it/s | -| NVIDIA RTX A6000 | 11.7it/s | 21.09it/s | -| NVIDIA TITAN RTX | 12.51it/s | 18.22it/s | -| A100-SXM4-40GB | 18.6it/s | 29.it/s | -| A100-SXM-80GB | 18.7it/s | 29.5it/s | - -To leverage it just make sure you have: - - - -If you have PyTorch 2.0 installed, you shouldn't use xFormers! - - - - - PyTorch > 1.12 - - Cuda available - - [Installed the xformers library](xformers). -```python -from diffusers import DiffusionPipeline -import torch - -pipe = DiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - torch_dtype=torch.float16, - use_safetensors=True, -).to("cuda") - -pipe.enable_xformers_memory_efficient_attention() - -with torch.inference_mode(): - sample = pipe("a small cat") - -# optional: You can disable it via -# pipe.disable_xformers_memory_efficient_attention() -``` + \ No newline at end of file diff --git a/docs/source/en/optimization/habana.md b/docs/source/en/optimization/habana.md index 24846615c95c..c78c8ca3a1be 100644 --- a/docs/source/en/optimization/habana.md +++ b/docs/source/en/optimization/habana.md @@ -10,25 +10,22 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. --> -# How to use Stable Diffusion on Habana Gaudi +# Habana Gaudi -🤗 Diffusers is compatible with Habana Gaudi through 🤗 [Optimum Habana](https://huggingface.co/docs/optimum/habana/usage_guides/stable_diffusion). +🤗 Diffusers is compatible with Habana Gaudi through 🤗 [Optimum](https://huggingface.co/docs/optimum/habana/usage_guides/stable_diffusion). Follow the [installation](https://docs.habana.ai/en/latest/Installation_Guide/index.html) guide to install the SynapseAI and Gaudi drivers, and then install Optimum Habana: -## Requirements +```bash +python -m pip install --upgrade-strategy eager optimum[habana] +``` -- Optimum Habana 1.6 or later, [here](https://huggingface.co/docs/optimum/habana/installation) is how to install it. -- SynapseAI 1.10. +To generate images with Stable Diffusion 1 and 2 on Gaudi, you need to instantiate two instances: +- [`~optimum.habana.diffusers.GaudiStableDiffusionPipeline`], a pipeline for text-to-image generation. +- [`~optimum.habana.diffusers.GaudiDDIMScheduler`], a Gaudi-optimized scheduler. -## Inference Pipeline +When you initialize the pipeline, you have to specify `use_habana=True` to deploy it on HPUs and to get the fastest possible generation, you should enable **HPU graphs** with `use_hpu_graphs=True`. 
-To generate images with Stable Diffusion 1 and 2 on Gaudi, you need to instantiate two instances: -- A pipeline with [`GaudiStableDiffusionPipeline`](https://huggingface.co/docs/optimum/habana/package_reference/stable_diffusion_pipeline). This pipeline supports *text-to-image generation*. -- A scheduler with [`GaudiDDIMScheduler`](https://huggingface.co/docs/optimum/habana/package_reference/stable_diffusion_pipeline#optimum.habana.diffusers.GaudiDDIMScheduler). This scheduler has been optimized for Habana Gaudi. - -When initializing the pipeline, you have to specify `use_habana=True` to deploy it on HPUs. -Furthermore, in order to get the fastest possible generations you should enable **HPU graphs** with `use_hpu_graphs=True`. -Finally, you will need to specify a [Gaudi configuration](https://huggingface.co/docs/optimum/habana/package_reference/gaudi_config) which can be downloaded from the [Hugging Face Hub](https://huggingface.co/Habana). +Finally, specify a [`~optimum.habana.GaudiConfig`] which can be downloaded from the [Habana](https://huggingface.co/Habana) organization on the Hub. ```python from optimum.habana import GaudiConfig @@ -45,7 +42,8 @@ pipeline = GaudiStableDiffusionPipeline.from_pretrained( ) ``` -You can then call the pipeline to generate images by batches from one or several prompts: +Now you can call the pipeline to generate images by batches from one or several prompts: + ```python outputs = pipeline( prompt=[ @@ -57,21 +55,21 @@ outputs = pipeline( ) ``` -For more information, check out Optimum Habana's [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/stable_diffusion) and the [example](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion) provided in the official Github repository. +For more information, check out 🤗 Optimum Habana's [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/stable_diffusion) and the [example](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion) provided in the official Github repository. ## Benchmark -Here are the latencies for Habana first-generation Gaudi and Gaudi2 with the [Habana/stable-diffusion](https://huggingface.co/Habana/stable-diffusion) and [Habana/stable-diffusion-2](https://huggingface.co/Habana/stable-diffusion-2) Gaudi configurations (mixed precision bf16/fp32): +We benchmarked Habana's first-generation Gaudi and Gaudi2 with the [Habana/stable-diffusion](https://huggingface.co/Habana/stable-diffusion) and [Habana/stable-diffusion-2](https://huggingface.co/Habana/stable-diffusion-2) Gaudi configurations (mixed precision bf16/fp32) to demonstrate their performance. 
-- [Stable Diffusion v1.5](https://huggingface.co/runwayml/stable-diffusion-v1-5) (512x512 resolution): +For [Stable Diffusion v1.5](https://huggingface.co/runwayml/stable-diffusion-v1-5) on 512x512 images: -| | Latency (batch size = 1) | Throughput (batch size = 8) | +| | Latency (batch size = 1) | Throughput | | ---------------------- |:------------------------:|:---------------------------:| -| first-generation Gaudi | 3.80s | 0.308 images/s | -| Gaudi2 | 1.33s | 1.081 images/s | +| first-generation Gaudi | 3.80s | 0.308 images/s (batch size = 8) | +| Gaudi2 | 1.33s | 1.081 images/s (batch size = 8) | -- [Stable Diffusion v2.1](https://huggingface.co/stabilityai/stable-diffusion-2-1) (768x768 resolution): +For [Stable Diffusion v2.1](https://huggingface.co/stabilityai/stable-diffusion-2-1) on 768x768 images: | | Latency (batch size = 1) | Throughput | | ---------------------- |:------------------------:|:-------------------------------:| diff --git a/docs/source/en/optimization/memory.md b/docs/source/en/optimization/memory.md new file mode 100644 index 000000000000..25c621231dcd --- /dev/null +++ b/docs/source/en/optimization/memory.md @@ -0,0 +1,367 @@ +# Reduce memory usage + +A barrier to using diffusion models is the large amount of memory required. To overcome this challenge, there are several memory-reducing techniques you can use to run even some of the largest models on free-tier or consumer GPUs. Some of these techniques can even be combined to further reduce memory usage. + + + +In many cases, optimizing for memory or speed leads to improved performance in the other, so you should try to optimize for both whenever you can. This guide focuses on minimizing memory usage, but you can also learn more about how to [Speed up inference](fp16). + + + +The results below are obtained from generating a single 512x512 image from the prompt a photo of an astronaut riding a horse on mars with 50 DDIM steps on a Nvidia Titan RTX, demonstrating the speed-up you can expect as a result of reduced memory consumption. + +| | latency | speed-up | +| ---------------- | ------- | ------- | +| original | 9.50s | x1 | +| fp16 | 3.61s | x2.63 | +| channels last | 3.30s | x2.88 | +| traced UNet | 3.21s | x2.96 | +| memory-efficient attention | 2.63s | x3.61 | + + +## Sliced VAE + +Sliced VAE enables decoding large batches of images with limited VRAM or batches with 32 images or more by decoding the batches of latents one image at a time. You'll likely want to couple this with [`~ModelMixin.enable_xformers_memory_efficient_attention`] to further reduce memory use. + +To use sliced VAE, call [`~StableDiffusionPipeline.enable_vae_slicing`] on your pipeline before inference: + +```python +import torch +from diffusers import StableDiffusionPipeline + +pipe = StableDiffusionPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", + torch_dtype=torch.float16, + use_safetensors=True, +) +pipe = pipe.to("cuda") + +prompt = "a photo of an astronaut riding a horse on mars" +pipe.enable_vae_slicing() +images = pipe([prompt] * 32).images +``` + +You may see a small performance boost in VAE decoding on multi-image batches, and there should be no performance impact on single-image batches. + +## Tiled VAE + +Tiled VAE processing also enables working with large images on limited VRAM (for example, generating 4k images on 8GB of VRAM) by splitting the image into overlapping tiles, decoding the tiles, and then blending the outputs together to compose the final image. 
You should also use tiled VAE with [`~ModelMixin.enable_xformers_memory_efficient_attention`] to further reduce memory use.
+
+To use tiled VAE processing, call [`~StableDiffusionPipeline.enable_vae_tiling`] on your pipeline before inference:
+
+```python
+import torch
+from diffusers import StableDiffusionPipeline, UniPCMultistepScheduler
+
+pipe = StableDiffusionPipeline.from_pretrained(
+    "runwayml/stable-diffusion-v1-5",
+    torch_dtype=torch.float16,
+    use_safetensors=True,
+)
+pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+pipe = pipe.to("cuda")
+prompt = "a beautiful landscape photograph"
+pipe.enable_vae_tiling()
+pipe.enable_xformers_memory_efficient_attention()
+
+image = pipe([prompt], width=3840, height=2224, num_inference_steps=20).images[0]
+```
+
+The output image has some tile-to-tile tone variation because the tiles are decoded separately, but you shouldn't see any sharp and obvious seams between the tiles. Tiling is turned off for images that are 512x512 or smaller.
+
+## CPU offloading
+
+Offloading the weights to the CPU and only loading them on the GPU when performing the forward pass can also save memory. Often, this technique can reduce memory consumption to less than 3GB.
+
+To perform CPU offloading, call [`~StableDiffusionPipeline.enable_sequential_cpu_offload`]:
+
+```Python
+import torch
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained(
+    "runwayml/stable-diffusion-v1-5",
+    torch_dtype=torch.float16,
+    use_safetensors=True,
+)
+
+prompt = "a photo of an astronaut riding a horse on mars"
+pipe.enable_sequential_cpu_offload()
+image = pipe(prompt).images[0]
+```
+
+CPU offloading works on submodules rather than whole models. This is the best way to minimize memory consumption, but inference is much slower due to the iterative nature of the diffusion process. The UNet component of the pipeline runs several times (as many as `num_inference_steps`); each time, the different UNet submodules are sequentially onloaded and offloaded as needed, resulting in a large number of memory transfers.
+
+Consider using [model offloading](#model-offloading) if you want to optimize for speed because it is much faster. The tradeoff is your memory savings won't be as large.
+
+CPU offloading can also be chained with attention slicing to reduce memory consumption to less than 2GB.
+
+```Python
+import torch
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained(
+    "runwayml/stable-diffusion-v1-5",
+    torch_dtype=torch.float16,
+    use_safetensors=True,
+)
+
+prompt = "a photo of an astronaut riding a horse on mars"
+pipe.enable_sequential_cpu_offload()
+
+image = pipe(prompt).images[0]
+```
+
+When using [`~StableDiffusionPipeline.enable_sequential_cpu_offload`], don't move the pipeline to CUDA beforehand or else the gain in memory consumption will only be minimal (see this [issue](https://github.com/huggingface/diffusers/issues/1934) for more information).
+
+[`~StableDiffusionPipeline.enable_sequential_cpu_offload`] is a stateful operation that installs hooks on the models.
+
+## Model offloading
+
+Model offloading requires 🤗 Accelerate version 0.17.0 or higher.
+
+[Sequential CPU offloading](#cpu-offloading) preserves a lot of memory but it makes inference slower because submodules are moved to GPU as needed, and they're immediately returned to the CPU when a new module runs.
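+
+If you want to verify the savings on your own hardware, one option is to read PyTorch's peak-memory counter around a generation. The snippet below is a minimal sketch rather than part of the pipeline API (the ~3GB figure above will vary with your model, resolution, and GPU):
+
+```python
+import torch
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained(
+    "runwayml/stable-diffusion-v1-5",
+    torch_dtype=torch.float16,
+    use_safetensors=True,
+)
+pipe.enable_sequential_cpu_offload()
+
+# Reset the peak-memory counter so it only reflects this generation
+torch.cuda.reset_peak_memory_stats()
+image = pipe("a photo of an astronaut riding a horse on mars").images[0]
+print(f"Peak GPU memory: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")
+```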
+
+Full-model offloading is an alternative that moves whole models to the GPU, instead of handling each model's constituent *submodules*. There is a negligible impact on inference time (compared with moving the pipeline to `cuda`), and it still provides some memory savings.
+
+During model offloading, only one of the main components of the pipeline (typically the text encoder, UNet and VAE)
+is placed on the GPU while the others wait on the CPU. Components like the UNet that run for multiple iterations stay on the GPU until they're no longer needed.
+
+Enable model offloading by calling [`~StableDiffusionPipeline.enable_model_cpu_offload`] on the pipeline:
+
+```Python
+import torch
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained(
+    "runwayml/stable-diffusion-v1-5",
+    torch_dtype=torch.float16,
+    use_safetensors=True,
+)
+
+prompt = "a photo of an astronaut riding a horse on mars"
+pipe.enable_model_cpu_offload()
+image = pipe(prompt).images[0]
+```
+
+Model offloading can also be combined with attention slicing for additional memory savings.
+
+```Python
+import torch
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained(
+    "runwayml/stable-diffusion-v1-5",
+    torch_dtype=torch.float16,
+    use_safetensors=True,
+)
+
+prompt = "a photo of an astronaut riding a horse on mars"
+pipe.enable_model_cpu_offload()
+
+image = pipe(prompt).images[0]
+```
+
+In order to properly offload models after they're called, the entire pipeline must be run and models must be called in the pipeline's expected order. Exercise caution if models are reused outside the context of the pipeline after hooks have been installed. See [Removing Hooks](https://huggingface.co/docs/accelerate/en/package_reference/big_modeling#accelerate.hooks.remove_hook_from_module)
+for more information.
+
+[`~StableDiffusionPipeline.enable_model_cpu_offload`] is a stateful operation that installs hooks on the models and state on the pipeline.
+
+## Channels-last memory format
+
+The channels-last memory format is an alternative way of ordering NCHW tensors in memory to preserve dimension ordering. Channels-last tensors are ordered in such a way that the channels become the densest dimension (storing images pixel-per-pixel). Since not all operators currently support the channels-last format, it may result in worse performance, but you should still try it and see if it works for your model.
+
+For example, to set the pipeline's UNet to use the channels-last format:
+
+```python
+print(pipe.unet.conv_out.state_dict()["weight"].stride())  # (2880, 9, 3, 1)
+pipe.unet.to(memory_format=torch.channels_last)  # in-place operation
+print(
+    pipe.unet.conv_out.state_dict()["weight"].stride()
+)  # (2880, 1, 960, 320) having a stride of 1 for the 2nd dimension proves that it works
+```
+
+## Tracing
+
+Tracing runs an example input tensor through the model and captures the operations that are performed on it as that input makes its way through the model's layers. The executable or `ScriptFunction` that is returned is optimized with just-in-time compilation.
+ +To trace a UNet: + +```python +import time +import torch +from diffusers import StableDiffusionPipeline +import functools + +# torch disable grad +torch.set_grad_enabled(False) + +# set variables +n_experiments = 2 +unet_runs_per_experiment = 50 + + +# load inputs +def generate_inputs(): + sample = torch.randn(2, 4, 64, 64).half().cuda() + timestep = torch.rand(1).half().cuda() * 999 + encoder_hidden_states = torch.randn(2, 77, 768).half().cuda() + return sample, timestep, encoder_hidden_states + + +pipe = StableDiffusionPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", + torch_dtype=torch.float16, + use_safetensors=True, +).to("cuda") +unet = pipe.unet +unet.eval() +unet.to(memory_format=torch.channels_last) # use channels_last memory format +unet.forward = functools.partial(unet.forward, return_dict=False) # set return_dict=False as default + +# warmup +for _ in range(3): + with torch.inference_mode(): + inputs = generate_inputs() + orig_output = unet(*inputs) + +# trace +print("tracing..") +unet_traced = torch.jit.trace(unet, inputs) +unet_traced.eval() +print("done tracing") + + +# warmup and optimize graph +for _ in range(5): + with torch.inference_mode(): + inputs = generate_inputs() + orig_output = unet_traced(*inputs) + + +# benchmarking +with torch.inference_mode(): + for _ in range(n_experiments): + torch.cuda.synchronize() + start_time = time.time() + for _ in range(unet_runs_per_experiment): + orig_output = unet_traced(*inputs) + torch.cuda.synchronize() + print(f"unet traced inference took {time.time() - start_time:.2f} seconds") + for _ in range(n_experiments): + torch.cuda.synchronize() + start_time = time.time() + for _ in range(unet_runs_per_experiment): + orig_output = unet(*inputs) + torch.cuda.synchronize() + print(f"unet inference took {time.time() - start_time:.2f} seconds") + +# save the model +unet_traced.save("unet_traced.pt") +``` + +Replace the `unet` attribute of the pipeline with the traced model: + +```python +from diffusers import StableDiffusionPipeline +import torch +from dataclasses import dataclass + + +@dataclass +class UNet2DConditionOutput: + sample: torch.FloatTensor + + +pipe = StableDiffusionPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", + torch_dtype=torch.float16, + use_safetensors=True, +).to("cuda") + +# use jitted unet +unet_traced = torch.jit.load("unet_traced.pt") + + +# del pipe.unet +class TracedUNet(torch.nn.Module): + def __init__(self): + super().__init__() + self.in_channels = pipe.unet.in_channels + self.device = pipe.unet.device + + def forward(self, latent_model_input, t, encoder_hidden_states): + sample = unet_traced(latent_model_input, t, encoder_hidden_states)[0] + return UNet2DConditionOutput(sample=sample) + + +pipe.unet = TracedUNet() + +with torch.inference_mode(): + image = pipe([prompt] * 1, num_inference_steps=50).images[0] +``` + +## Memory-efficient attention + +Recent work on optimizing bandwidth in the attention block has generated huge speed-ups and reductions in GPU memory usage. The most recent type of memory-efficient attention is [Flash Attention](https://arxiv.org/pdf/2205.14135.pdf) (you can check out the original code at [HazyResearch/flash-attention](https://github.com/HazyResearch/flash-attention)). 
+ +The table below details the speed-ups from a few different Nvidia GPUs when running inference on image sizes of 512x512 and a batch size of 1 (one prompt): + +| GPU | base attention (fp16) | memory-efficient attention (fp16) | +|------------------|-----------------------|-----------------------------------| +| NVIDIA Tesla T4 | 3.5it/s | 5.5it/s | +| NVIDIA 3060 RTX | 4.6it/s | 7.8it/s | +| NVIDIA A10G | 8.88it/s | 15.6it/s | +| NVIDIA RTX A6000 | 11.7it/s | 21.09it/s | +| NVIDIA TITAN RTX | 12.51it/s | 18.22it/s | +| A100-SXM4-40GB | 18.6it/s | 29.it/s | +| A100-SXM-80GB | 18.7it/s | 29.5it/s | + + + +If you have PyTorch 2.0 installed, you shouldn't use xFormers! + + + +To use Flash Attention, install the following: + +- PyTorch > 1.12 +- CUDA available +- [xFormers](xformers) + +Then call [`~ModelMixin.enable_xformers_memory_efficient_attention`] on the pipeline: + +```python +from diffusers import DiffusionPipeline +import torch + +pipe = DiffusionPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", + torch_dtype=torch.float16, + use_safetensors=True, +).to("cuda") + +pipe.enable_xformers_memory_efficient_attention() + +with torch.inference_mode(): + sample = pipe("a small cat") + +# optional: You can disable it via +# pipe.disable_xformers_memory_efficient_attention() +``` diff --git a/docs/source/en/optimization/mps.md b/docs/source/en/optimization/mps.md index 3be8c621ee3e..138c85b51184 100644 --- a/docs/source/en/optimization/mps.md +++ b/docs/source/en/optimization/mps.md @@ -10,29 +10,16 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. --> -# How to use Stable Diffusion in Apple Silicon (M1/M2) +# Metal Performance Shaders (MPS) -🤗 Diffusers is compatible with Apple silicon for Stable Diffusion inference, using the PyTorch `mps` device. These are the steps you need to follow to use your M1 or M2 computer with Stable Diffusion. +🤗 Diffusers is compatible with Apple silicon (M1/M2 chips) using the PyTorch [`mps`](https://pytorch.org/docs/stable/notes/mps.html) device, which uses the Metal framework to leverage the GPU on MacOS devices. You'll need to have: -## Requirements +- macOS computer with Apple silicon (M1/M2) hardware +- macOS 12.6 or later (13.0 or later recommended) +- arm64 version of Python +- [PyTorch 2.0](https://pytorch.org/get-started/locally/) (recommended) or 1.13 (minimum version supported for `mps`) -- Mac computer with Apple silicon (M1/M2) hardware. -- macOS 12.6 or later (13.0 or later recommended). -- arm64 version of Python. -- PyTorch 2.0 (recommended) or 1.13 (minimum version supported for `mps`). You can install it with `pip` or `conda` using the instructions in https://pytorch.org/get-started/locally/. - - -## Inference Pipeline - -The snippet below demonstrates how to use the `mps` backend using the familiar `to()` interface to move the Stable Diffusion pipeline to your M1 or M2 device. - - - -**If you are using PyTorch 1.13** you need to "prime" the pipeline using an additional one-time pass through it. This is a temporary workaround for a weird issue we detected: the first inference pass produces slightly different results than subsequent ones. You only need to do this pass once, and it's ok to use just one inference step and discard the result. - - - -We strongly recommend you use PyTorch 2 or better, as it solves a number of problems like the one described in the previous tip. 
+The `mps` backend uses PyTorch's `.to()` interface to move the Stable Diffusion pipeline on to your M1 or M2 device: ```python from diffusers import DiffusionPipeline @@ -44,24 +31,41 @@ pipe = pipe.to("mps") pipe.enable_attention_slicing() prompt = "a photo of an astronaut riding a horse on mars" +``` + + + +Generating multiple prompts in a batch can [crash](https://github.com/huggingface/diffusers/issues/363) or fail to work reliably. We believe this is related to the [`mps`](https://github.com/pytorch/pytorch/issues/84039) backend in PyTorch. While this is being investigated, you should iterate instead of batching. + + + +If you're using **PyTorch 1.13**, you need to "prime" the pipeline with an additional one-time pass through it. This is a temporary workaround for an issue where the first inference pass produces slightly different results than subsequent ones. You only need to do this pass once, and after just one inference step you can discard the result. + +```diff + from diffusers import DiffusionPipeline -# First-time "warmup" pass if PyTorch version is 1.13 (see explanation above) -_ = pipe(prompt, num_inference_steps=1) + pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5").to("mps") + pipe.enable_attention_slicing() + + prompt = "a photo of an astronaut riding a horse on mars" +# First-time "warmup" pass if PyTorch version is 1.13 ++ _ = pipe(prompt, num_inference_steps=1) # Results match those from the CPU device after the warmup pass. -image = pipe(prompt).images[0] + image = pipe(prompt).images[0] ``` -## Performance Recommendations +## Troubleshoot -M1/M2 performance is very sensitive to memory pressure. The system will automatically swap if it needs to, but performance will degrade significantly when it does. +M1/M2 performance is very sensitive to memory pressure. When this occurs, the system automatically swaps if it needs to which significantly degrades performance. -We recommend you use _attention slicing_ to reduce memory pressure during inference and prevent swapping, particularly if your computer has less than 64 GB of system RAM, or if you generate images at non-standard resolutions larger than 512 × 512 pixels. Attention slicing performs the costly attention operation in multiple steps instead of all at once. It usually has a performance impact of ~20% in computers without universal memory, but we have observed _better performance_ in most Apple Silicon computers, unless you have 64 GB or more. +To prevent this from happening, we recommend *attention slicing* to reduce memory pressure during inference and prevent swapping. This is especially relevant if your computer has less than 64GB of system RAM, or if you generate images at non-standard resolutions larger than 512×512 pixels. Call the [`~DiffusionPipeline.enable_attention_slicing`] function on your pipeline: -```python +```py +from diffusers import DiffusionPipeline + +pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, variant="fp16", use_safetensors=True).to("mps") pipeline.enable_attention_slicing() ``` -## Known Issues - -- Generating multiple prompts in a batch [crashes or doesn't work reliably](https://github.com/huggingface/diffusers/issues/363). We believe this is related to the [`mps` backend in PyTorch](https://github.com/pytorch/pytorch/issues/84039). This is being resolved, but for now we recommend to iterate instead of batching. 
+Attention slicing performs the costly attention operation in multiple steps instead of all at once. It usually improves performance by ~20% in computers without universal memory, but we've observed *better performance* in most Apple silicon computers unless you have 64GB of RAM or more. diff --git a/docs/source/en/optimization/onnx.md b/docs/source/en/optimization/onnx.md index 2e39fabd1e27..20104b555543 100644 --- a/docs/source/en/optimization/onnx.md +++ b/docs/source/en/optimization/onnx.md @@ -11,23 +11,19 @@ specific language governing permissions and limitations under the License. --> -# How to use ONNX Runtime for inference +# ONNX Runtime -🤗 [Optimum](https://github.com/huggingface/optimum) provides a Stable Diffusion pipeline compatible with ONNX Runtime. +🤗 [Optimum](https://github.com/huggingface/optimum) provides a Stable Diffusion pipeline compatible with ONNX Runtime. You'll need to install 🤗 Optimum with the following command for ONNX Runtime support: -## Installation - -Install 🤗 Optimum with the following command for ONNX Runtime support: - -``` +```bash pip install optimum["onnxruntime"] ``` -## Stable Diffusion +This guide will show you how to use the Stable Diffusion and Stable Diffusion XL (SDXL) pipelines with ONNX Runtime. -### Inference +## Stable Diffusion -To load an ONNX model and run inference with ONNX Runtime, you need to replace [`StableDiffusionPipeline`] with `ORTStableDiffusionPipeline`. In case you want to load a PyTorch model and convert it to the ONNX format on-the-fly, you can set `export=True`. +To load and run inference, use the [`~optimum.onnxruntime.ORTStableDiffusionPipeline`]. If you want to load a PyTorch model and convert it to the ONNX format on-the-fly, set `export=True`: ```python from optimum.onnxruntime import ORTStableDiffusionPipeline @@ -39,14 +35,20 @@ image = pipeline(prompt).images[0] pipeline.save_pretrained("./onnx-stable-diffusion-v1-5") ``` -If you want to export the pipeline in the ONNX format offline and later use it for inference, -you can use the [`optimum-cli export`](https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model#exporting-a-model-to-onnx-using-the-cli) command: + + +Generating multiple prompts in a batch seems to take too much memory. While we look into it, you may need to iterate instead of batching. + + + +To export the pipeline in the ONNX format offline and use it later for inference, +use the [`optimum-cli export`](https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model#exporting-a-model-to-onnx-using-the-cli) command: ```bash optimum-cli export onnx --model runwayml/stable-diffusion-v1-5 sd_v15_onnx/ ``` -Then perform inference: +Then to perform inference (you don't have to specify `export=True` again): ```python from optimum.onnxruntime import ORTStableDiffusionPipeline @@ -57,36 +59,15 @@ prompt = "sailing ship in storm by Leonardo da Vinci" image = pipeline(prompt).images[0] ``` -Notice that we didn't have to specify `export=True` above. -
-You can find more examples in [optimum documentation](https://huggingface.co/docs/optimum/).
-
-
-### Supported tasks
-
-| Task | Loading Class |
-|--------------------------------------|--------------------------------------|
-| `text-to-image` | `ORTStableDiffusionPipeline` |
-| `image-to-image` | `ORTStableDiffusionImg2ImgPipeline` |
-| `inpaint` | `ORTStableDiffusionInpaintPipeline` |
+You can find more examples in 🤗 Optimum [documentation](https://huggingface.co/docs/optimum/), and Stable Diffusion is supported for text-to-image, image-to-image, and inpainting.

## Stable Diffusion XL

-### Export
-
-To export your model to ONNX, you can use the [Optimum CLI](https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model#exporting-a-model-to-onnx-using-the-cli) as follows :
-
-```bash
-optimum-cli export onnx --model stabilityai/stable-diffusion-xl-base-1.0 --task stable-diffusion-xl sd_xl_onnx/
-```
-
-### Inference
-
-Here is an example of how you can load a SDXL ONNX model from [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) and run inference with ONNX Runtime :
+To load and run inference with SDXL, use the [`~optimum.onnxruntime.ORTStableDiffusionXLPipeline`]:

```python
from optimum.onnxruntime import ORTStableDiffusionXLPipeline
@@ -97,13 +78,10 @@
prompt = "sailing ship in storm by Leonardo da Vinci"
image = pipeline(prompt).images[0]
```

-### Supported tasks
-
-| Task | Loading Class |
-|--------------------------------------|--------------------------------------|
-| `text-to-image` | `ORTStableDiffusionXLPipeline` |
-| `image-to-image` | `ORTStableDiffusionXLImg2ImgPipeline`|
+To export the pipeline in the ONNX format and use it later for inference, use the [`optimum-cli export`](https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model#exporting-a-model-to-onnx-using-the-cli) command:

-## Known Issues
+```bash
+optimum-cli export onnx --model stabilityai/stable-diffusion-xl-base-1.0 --task stable-diffusion-xl sd_xl_onnx/
+```

-- Generating multiple prompts in a batch seems to take too much memory. While we look into it, you may need to iterate instead of batching.
+SDXL in the ONNX format is supported for text-to-image and image-to-image.

diff --git a/docs/source/en/optimization/open_vino.md b/docs/source/en/optimization/open_vino.md
index a820fb62f949..606c2207bcda 100644
--- a/docs/source/en/optimization/open_vino.md
+++ b/docs/source/en/optimization/open_vino.md
@@ -11,26 +11,21 @@ specific language governing permissions and limitations under the License.
-->

-# How to use OpenVINO for inference
+# OpenVINO

-🤗 [Optimum](https://github.com/huggingface/optimum-intel) provides Stable Diffusion pipelines compatible with OpenVINO. You can now easily perform inference with OpenVINO Runtime on a variety of Intel processors ([see](https://docs.openvino.ai/latest/openvino_docs_OV_UG_supported_plugins_Supported_Devices.html) the full list of supported devices).
+🤗 [Optimum](https://github.com/huggingface/optimum-intel) provides Stable Diffusion pipelines compatible with OpenVINO to perform inference on a variety of Intel processors (see the [full list](https://docs.openvino.ai/latest/openvino_docs_OV_UG_supported_plugins_Supported_Devices.html) of supported devices).
-## Installation
-
-Install 🤗 Optimum Intel with the following command:
+You'll need to install 🤗 Optimum Intel with the `--upgrade-strategy eager` option to ensure [`optimum-intel`](https://github.com/huggingface/optimum-intel) is using the latest version:

```
pip install --upgrade-strategy eager optimum["openvino"]
```

-The `--upgrade-strategy eager` option is needed to ensure [`optimum-intel`](https://github.com/huggingface/optimum-intel) is upgraded to its latest version.
-
+This guide will show you how to use the Stable Diffusion and Stable Diffusion XL (SDXL) pipelines with OpenVINO.

## Stable Diffusion

-### Inference
-
-To load an OpenVINO model and run inference with OpenVINO Runtime, you need to replace `StableDiffusionPipeline` with `OVStableDiffusionPipeline`. In case you want to load a PyTorch model and convert it to the OpenVINO format on-the-fly, you can set `export=True`.
+To load and run inference, use the [`~optimum.intel.OVStableDiffusionPipeline`]. If you want to load a PyTorch model and convert it to the OpenVINO format on-the-fly, set `export=True`:

```python
from optimum.intel import OVStableDiffusionPipeline
@@ -44,7 +39,7 @@
image = pipeline(prompt).images[0]
pipeline.save_pretrained("openvino-sd-v1-5")
```

-To further speed up inference, the model can be statically reshaped :
+To further speed up inference, statically reshape the model. If you change any parameters such as the output height or width, you’ll need to statically reshape your model again.

```python
# Define the shapes related to the inputs and desired outputs
batch_size, num_images, height, width = 1, 1, 512, 512
pipeline.reshape(batch_size=batch_size, height=height, width=width, num_images_per_prompt=num_images)
@@ -62,30 +57,15 @@ image = pipeline(
    num_images_per_prompt=num_images,
).images[0]
```
-
-In case you want to change any parameters such as the outputs height or width, you’ll need to statically reshape your model once again.
-
-
-### Supported tasks
-
-| Task | Loading Class |
-|--------------------------------------|--------------------------------------|
-| `text-to-image` | `OVStableDiffusionPipeline` |
-| `image-to-image` | `OVStableDiffusionImg2ImgPipeline` |
-| `inpaint` | `OVStableDiffusionInpaintPipeline` |
-
-You can find more examples in the optimum [documentation](https://huggingface.co/docs/optimum/intel/inference#stable-diffusion).
-
+You can find more examples in the 🤗 Optimum [documentation](https://huggingface.co/docs/optimum/intel/inference#stable-diffusion), and Stable Diffusion is supported for text-to-image, image-to-image, and inpainting.

## Stable Diffusion XL

-### Inference
-
-Here is an example of how you can load a SDXL OpenVINO model from [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) and run inference with OpenVINO Runtime :
+To load and run inference with SDXL, use the [`~optimum.intel.OVStableDiffusionXLPipeline`]:

```python
from optimum.intel import OVStableDiffusionXLPipeline

model_id = "stabilityai/stable-diffusion-xl-base-1.0"
pipeline = OVStableDiffusionXLPipeline.from_pretrained(model_id)
prompt = "sailing ship in storm by Rembrandt"
image = pipeline(prompt).images[0]
```

-To further speed up inference, the model can be statically reshaped as showed above.
-You can find more examples in the optimum [documentation](https://huggingface.co/docs/optimum/intel/inference#stable-diffusion-xl).
-
-### Supported tasks
-
-| Task | Loading Class |
-|--------------------------------------|--------------------------------------|
-| `text-to-image` | `OVStableDiffusionXLPipeline` |
-| `image-to-image` | `OVStableDiffusionXLImg2ImgPipeline` |
-
-
+To further speed up inference, [statically reshape](#stable-diffusion) the model as shown in the Stable Diffusion section.
+You can find more examples in the 🤗 Optimum [documentation](https://huggingface.co/docs/optimum/intel/inference#stable-diffusion-xl), and running SDXL in OpenVINO is supported for text-to-image and image-to-image.

diff --git a/docs/source/en/optimization/opt_overview.md b/docs/source/en/optimization/opt_overview.md
index 8d8386f85f43..1f809bb011ce 100644
--- a/docs/source/en/optimization/opt_overview.md
+++ b/docs/source/en/optimization/opt_overview.md
@@ -12,6 +12,6 @@ specific language governing permissions and limitations under the License.

# Overview

-Generating high-quality outputs is computationally intensive, especially during each iterative step where you go from a noisy output to a less noisy output. One of 🧨 Diffuser's goal is to make this technology widely accessible to everyone, which includes enabling fast inference on consumer and specialized hardware.
+Generating high-quality outputs is computationally intensive, especially during each iterative step where you go from a noisy output to a less noisy output. One of 🤗 Diffusers' goals is to make this technology widely accessible to everyone, which includes enabling fast inference on consumer and specialized hardware.
\ No newline at end of file
+This section will cover tips and tricks - like half-precision weights and sliced attention - for optimizing inference speed and reducing memory-consumption. You'll also learn how to speed up your PyTorch code with [`torch.compile`](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) or [ONNX Runtime](https://onnxruntime.ai/docs/), and enable memory-efficient attention with [xFormers](https://facebookresearch.github.io/xformers/). There are also guides for running inference on specific hardware like Apple Silicon, and Intel or Habana processors.
\ No newline at end of file
diff --git a/docs/source/en/optimization/tome.md b/docs/source/en/optimization/tome.md
index c2158f539a65..66d69c6900cc 100644
--- a/docs/source/en/optimization/tome.md
+++ b/docs/source/en/optimization/tome.md
@@ -10,35 +10,39 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
specific language governing permissions and limitations under the License.
-->

-# Token Merging
+# Token merging

-Token Merging (introduced in [Token Merging: Your ViT But Faster](https://arxiv.org/abs/2210.09461)) works by merging the redundant tokens / patches progressively in the forward pass of a Transformer-based network. It can speed up the inference latency of the underlying network.
+[Token merging](https://huggingface.co/papers/2303.17604) (ToMe) merges redundant tokens/patches progressively in the forward pass of a Transformer-based network, which can speed up the inference latency of the [`StableDiffusionPipeline`].

-After Token Merging (ToMe) was released, the authors released [Token Merging for Fast Stable Diffusion](https://arxiv.org/abs/2303.17604), which introduced a version of ToMe which is more compatible with Stable Diffusion. We can use ToMe to gracefully speed up the inference latency of a [`DiffusionPipeline`]. This doc discusses how to apply ToMe to the [`StableDiffusionPipeline`], the expected speedups, and the qualitative aspects of using ToMe on the [`StableDiffusionPipeline`].
-
-## Using ToMe
-
-The authors of ToMe released a convenient Python library called [`tomesd`](https://github.com/dbolya/tomesd) that lets us apply ToMe to a [`DiffusionPipeline`] like so:
+You can use ToMe from the [`tomesd`](https://github.com/dbolya/tomesd) library with the [`apply_patch`](https://github.com/dbolya/tomesd?tab=readme-ov-file#usage) function:

```diff
  from diffusers import StableDiffusionPipeline
  import tomesd

  pipeline = StableDiffusionPipeline.from_pretrained(
-      "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
+      "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True,
  ).to("cuda")
+ tomesd.apply_patch(pipeline, ratio=0.5)

  image = pipeline("a photo of an astronaut riding a horse on mars").images[0]
```

-And that’s it!
+The `apply_patch` function exposes a number of [arguments](https://github.com/dbolya/tomesd#usage) to help strike a balance between pipeline inference speed and the quality of the generated tokens. The most important argument is `ratio`, which controls the number of tokens that are merged during the forward pass.
+
+As reported in the [paper](https://huggingface.co/papers/2303.17604), ToMe can greatly preserve the quality of the generated images while boosting inference speed. By increasing the `ratio`, you can speed up inference even further, but at the cost of some degraded image quality.
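+
+For instance, here is a minimal sketch of trying a few ratios (this assumes `tomesd` is installed and that `tomesd.remove_patch` is used to reset the pipeline between runs; the exact timings depend on your hardware and the first run may include warmup overhead):
+
+```python
+import time
+
+import torch
+import tomesd
+from diffusers import StableDiffusionPipeline
+
+pipeline = StableDiffusionPipeline.from_pretrained(
+    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True,
+).to("cuda")
+
+for ratio in [0.3, 0.5, 0.75]:
+    tomesd.apply_patch(pipeline, ratio=ratio)
+    start = time.perf_counter()
+    image = pipeline("a photo of an astronaut riding a horse on mars").images[0]
+    print(f"ratio={ratio}: {time.perf_counter() - start:.2f}s")
+    image.save(f"astronaut_ratio_{ratio}.png")  # compare quality across ratios
+    # Undo the patch before applying the next ratio
+    tomesd.remove_patch(pipeline)
+```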
+
+To test the quality of the generated images, we sampled a few prompts from [Parti Prompts](https://parti.research.google/) and performed inference with the [`StableDiffusionPipeline`] in the following settings: vanilla, with ToMe, and with ToMe + xFormers.
+
+![tome-samples](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/tome/tome_samples.png)
+
-`tomesd.apply_patch()` exposes [a number of arguments](https://github.com/dbolya/tomesd#usage) to let us strike a balance between the pipeline inference speed and the quality of the generated tokens. Amongst those arguments, the most important one is `ratio`. `ratio` controls the number of tokens that will be merged during the forward pass. For more details on `tomesd`, please refer to the original repository https://github.com/dbolya/tomesd and [the paper](https://arxiv.org/abs/2303.17604). +We didn’t notice any significant decrease in the quality of the generated samples, and you can check out the generated samples in this [WandB report](https://wandb.ai/sayakpaul/tomesd-results/runs/23j4bj3i?workspace=). If you're interested in reproducing this experiment, use this [script](https://gist.github.com/sayakpaul/8cac98d7f22399085a060992f411ecbd). -## Benchmarking `tomesd` with `StableDiffusionPipeline` +## Benchmarks -We benchmarked the impact of using `tomesd` on [`StableDiffusionPipeline`] along with [xformers](https://huggingface.co/docs/diffusers/optimization/xformers) across different image resolutions. We used A100 and V100 as our test GPU devices with the following development environment (with Python 3.8.5): +We also benchmarked the impact of `tomesd` on the [`StableDiffusionPipeline`] with [xFormers](https://huggingface.co/docs/diffusers/optimization/xformers) enabled across several image resolutions. The results are obtained from A100 and V100 GPUs in the following development environment: ```bash - `diffusers` version: 0.15.1 @@ -51,66 +55,35 @@ We benchmarked the impact of using `tomesd` on [`StableDiffusionPipeline`] along - tomesd version: 0.1.2 ``` -We used this script for benchmarking: [https://gist.github.com/sayakpaul/27aec6bca7eb7b0e0aa4112205850335](https://gist.github.com/sayakpaul/27aec6bca7eb7b0e0aa4112205850335). Following are our findings: - -### A100 - -| Resolution | Batch size | Vanilla | ToMe | ToMe + xFormers | ToMe speedup (%) | ToMe + xFormers speedup (%) | -| --- | --- | --- | --- | --- | --- | --- | -| 512 | 10 | 6.88 | 5.26 | 4.69 | 23.54651163 | 31.83139535 | -| | | | | | | | -| 768 | 10 | OOM | 14.71 | 11 | | | -| | 8 | OOM | 11.56 | 8.84 | | | -| | 4 | OOM | 5.98 | 4.66 | | | -| | 2 | 4.99 | 3.24 | 3.1 | 35.07014028 | 37.8757515 | -| | 1 | 3.29 | 2.24 | 2.03 | 31.91489362 | 38.29787234 | -| | | | | | | | -| 1024 | 10 | OOM | OOM | OOM | | | -| | 8 | OOM | OOM | OOM | | | -| | 4 | OOM | 12.51 | 9.09 | | | -| | 2 | OOM | 6.52 | 4.96 | | | -| | 1 | 6.4 | 3.61 | 2.81 | 43.59375 | 56.09375 | - -***The timings reported here are in seconds. 
Speedups are calculated over the `Vanilla` timings.*** - -### V100 - -| Resolution | Batch size | Vanilla | ToMe | ToMe + xFormers | ToMe speedup (%) | ToMe + xFormers speedup (%) | -| --- | --- | --- | --- | --- | --- | --- | -| 512 | 10 | OOM | 10.03 | 9.29 | | | -| | 8 | OOM | 8.05 | 7.47 | | | -| | 4 | 5.7 | 4.3 | 3.98 | 24.56140351 | 30.1754386 | -| | 2 | 3.14 | 2.43 | 2.27 | 22.61146497 | 27.70700637 | -| | 1 | 1.88 | 1.57 | 1.57 | 16.4893617 | 16.4893617 | -| | | | | | | | -| 768 | 10 | OOM | OOM | 23.67 | | | -| | 8 | OOM | OOM | 18.81 | | | -| | 4 | OOM | 11.81 | 9.7 | | | -| | 2 | OOM | 6.27 | 5.2 | | | -| | 1 | 5.43 | 3.38 | 2.82 | 37.75322284 | 48.06629834 | -| | | | | | | | -| 1024 | 10 | OOM | OOM | OOM | | | -| | 8 | OOM | OOM | OOM | | | -| | 4 | OOM | OOM | 19.35 | | | -| | 2 | OOM | 13 | 10.78 | | | -| | 1 | OOM | 6.66 | 5.54 | | | - -As seen in the tables above, the speedup with `tomesd` becomes more pronounced for larger image resolutions. It is also interesting to note that with `tomesd`, it becomes possible to run the pipeline on a higher resolution, like 1024x1024. - -It might be possible to speed up inference even further with [`torch.compile()`](https://huggingface.co/docs/diffusers/optimization/torch2.0). - -## Quality - -As reported in [the paper](https://arxiv.org/abs/2303.17604), ToMe can preserve the quality of the generated images to a great extent while speeding up inference. By increasing the `ratio`, it is possible to further speed up inference, but that might come at the cost of a deterioration in the image quality. - -To test the quality of the generated samples using our setup, we sampled a few prompts from the “Parti Prompts” (introduced in [Parti](https://parti.research.google/)) and performed inference with the [`StableDiffusionPipeline`] in the following settings: - -- Vanilla [`StableDiffusionPipeline`] -- [`StableDiffusionPipeline`] + ToMe -- [`StableDiffusionPipeline`] + ToMe + xformers - -We didn’t notice any significant decrease in the quality of the generated samples. Here are samples: - -![tome-samples](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/tome/tome_samples.png) - -You can check out the generated samples [here](https://wandb.ai/sayakpaul/tomesd-results/runs/23j4bj3i?workspace=). We used [this script](https://gist.github.com/sayakpaul/8cac98d7f22399085a060992f411ecbd) for conducting this experiment. \ No newline at end of file +To reproduce this benchmark, feel free to use this [script](https://gist.github.com/sayakpaul/27aec6bca7eb7b0e0aa4112205850335). The results are reported in seconds, and where applicable we report the speed-up percentage over the vanilla pipeline when using ToMe and ToMe + xFormers. 
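+
+The speed-up percentages in the table are relative to the vanilla pipeline timing, which you can verify with a quick calculation (shown here for the A100, 512 resolution, batch size 10 row):
+
+```python
+# speed-up (%) = (vanilla - optimized) / vanilla * 100
+vanilla, tome, tome_xformers = 6.88, 5.26, 4.69  # timings in seconds
+print(f"ToMe: {(vanilla - tome) / vanilla * 100:.2f}%")  # ToMe: 23.55%
+print(f"ToMe + xFormers: {(vanilla - tome_xformers) / vanilla * 100:.2f}%")  # ToMe + xFormers: 31.83%
+```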
+
+| **GPU** | **Resolution** | **Batch size** | **Vanilla** | **ToMe** | **ToMe + xFormers** |
+|----------|----------------|----------------|-------------|----------------|---------------------|
+| **A100** | 512 | 10 | 6.88 | 5.26 (+23.55%) | 4.69 (+31.83%) |
+| | 768 | 10 | OOM | 14.71 | 11 |
+| | | 8 | OOM | 11.56 | 8.84 |
+| | | 4 | OOM | 5.98 | 4.66 |
+| | | 2 | 4.99 | 3.24 (+35.07%) | 3.1 (+37.88%) |
+| | | 1 | 3.29 | 2.24 (+31.91%) | 2.03 (+38.3%) |
+| | 1024 | 10 | OOM | OOM | OOM |
+| | | 8 | OOM | OOM | OOM |
+| | | 4 | OOM | 12.51 | 9.09 |
+| | | 2 | OOM | 6.52 | 4.96 |
+| | | 1 | 6.4 | 3.61 (+43.59%) | 2.81 (+56.09%) |
+| **V100** | 512 | 10 | OOM | 10.03 | 9.29 |
+| | | 8 | OOM | 8.05 | 7.47 |
+| | | 4 | 5.7 | 4.3 (+24.56%) | 3.98 (+30.18%) |
+| | | 2 | 3.14 | 2.43 (+22.61%) | 2.27 (+27.71%) |
+| | | 1 | 1.88 | 1.57 (+16.49%) | 1.57 (+16.49%) |
+| | 768 | 10 | OOM | OOM | 23.67 |
+| | | 8 | OOM | OOM | 18.81 |
+| | | 4 | OOM | 11.81 | 9.7 |
+| | | 2 | OOM | 6.27 | 5.2 |
+| | | 1 | 5.43 | 3.38 (+37.75%) | 2.82 (+48.07%) |
+| | 1024 | 10 | OOM | OOM | OOM |
+| | | 8 | OOM | OOM | OOM |
+| | | 4 | OOM | OOM | 19.35 |
+| | | 2 | OOM | 13 | 10.78 |
+| | | 1 | OOM | 6.66 | 5.54 |
+
+As seen in the table above, the speed-up from `tomesd` becomes more pronounced for larger image resolutions. It is also interesting to note that with `tomesd`, it is possible to run the pipeline on a higher resolution like 1024x1024. You may be able to speed up inference even more with [`torch.compile`](torch2.0).
diff --git a/docs/source/en/optimization/torch2.0.md b/docs/source/en/optimization/torch2.0.md
index e1de432f6590..4984f385a3f5 100644
--- a/docs/source/en/optimization/torch2.0.md
+++ b/docs/source/en/optimization/torch2.0.md
@@ -10,96 +10,83 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->
 
-# Accelerated PyTorch 2.0 support in Diffusers
+# Torch 2.0
 
-Starting from version `0.13.0`, Diffusers supports the latest optimization from [PyTorch 2.0](https://pytorch.org/get-started/pytorch-2.0/). These include:
-1. Support for accelerated transformers implementation with memory-efficient attention – no extra dependencies (such as `xformers`) required.
-2. [torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) support for extra performance boost when individual models are compiled.
+🤗 Diffusers supports the latest optimizations from [PyTorch 2.0](https://pytorch.org/get-started/pytorch-2.0/) which include:
+1. A memory-efficient attention implementation, scaled dot product attention, without requiring any extra dependencies such as xFormers.
+2. [`torch.compile`](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html), a just-in-time (JIT) compiler to provide an extra performance boost when individual models are compiled.
 
-## Installation
-
-To benefit from the accelerated attention implementation and `torch.compile()`, you just need to install the latest versions of PyTorch 2.0 from pip, and make sure you are on diffusers 0.13.0 or later. As explained below, diffusers automatically uses the optimized attention processor ([`AttnProcessor2_0`](https://github.com/huggingface/diffusers/blob/1a5797c6d4491a879ea5285c4efc377664e0332d/src/diffusers/models/attention_processor.py#L798)) (but not `torch.compile()`)
-when PyTorch 2.0 is available.
+Both of these optimizations require PyTorch 2.0 or later and 🤗 Diffusers > 0.13.0.
```bash pip install --upgrade torch diffusers ``` -## Using accelerated transformers and `torch.compile`. - - -1. **Accelerated Transformers implementation** - - PyTorch 2.0 includes an optimized and memory-efficient attention implementation through the [`torch.nn.functional.scaled_dot_product_attention`](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention) function, which automatically enables several optimizations depending on the inputs and the GPU type. This is similar to the `memory_efficient_attention` from [xFormers](https://github.com/facebookresearch/xformers), but built natively into PyTorch. +## Scaled dot product attention - These optimizations will be enabled by default in Diffusers if PyTorch 2.0 is installed and if `torch.nn.functional.scaled_dot_product_attention` is available. To use it, just install `torch 2.0` as suggested above and simply use the pipeline. For example: +[`torch.nn.functional.scaled_dot_product_attention`](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention) (SDPA) is an optimized and memory-efficient attention (similar to xFormers) that automatically enables several other optimizations depending on the model inputs and GPU type. SDPA is enabled by default if you're using PyTorch 2.0 and the latest version of 🤗 Diffusers, so you don't need to add anything to your code. - ```Python - import torch - from diffusers import DiffusionPipeline +However, if you want to explicitly enable it, you can set a [`DiffusionPipeline`] to use [`~models.attention_processor.AttnProcessor2_0`]: - pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True) - pipe = pipe.to("cuda") +```diff + import torch + from diffusers import DiffusionPipeline ++ from diffusers.models.attention_processor import AttnProcessor2_0 - prompt = "a photo of an astronaut riding a horse on mars" - image = pipe(prompt).images[0] - ``` + pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True).to("cuda") ++ pipe.unet.set_attn_processor(AttnProcessor2_0()) - If you want to enable it explicitly (which is not required), you can do so as shown below. - - ```diff - import torch - from diffusers import DiffusionPipeline - + from diffusers.models.attention_processor import AttnProcessor2_0 + prompt = "a photo of an astronaut riding a horse on mars" + image = pipe(prompt).images[0] +``` - pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True).to("cuda") - + pipe.unet.set_attn_processor(AttnProcessor2_0()) +SDPA should be as fast and memory efficient as `xFormers`; check the [benchmark](#benchmark) for more details. - prompt = "a photo of an astronaut riding a horse on mars" - image = pipe(prompt).images[0] - ``` +In some cases - such as making the pipeline more deterministic or converting it to other formats - it may be helpful to use the vanilla attention processor, [`~models.attention_processor.AttnProcessor`]. To revert to [`~models.attention_processor.AttnProcessor`], call the [`~UNet2DConditionModel.set_default_attn_processor`] function on the pipeline: - This should be as fast and memory efficient as `xFormers`. More details [in our benchmark](#benchmark). 
+```diff
+  import torch
+  from diffusers import DiffusionPipeline
+  from diffusers.models.attention_processor import AttnProcessor
 
-    It is possible to revert to the vanilla attention processor ([`AttnProcessor`](https://github.com/huggingface/diffusers/blob/1a5797c6d4491a879ea5285c4efc377664e0332d/src/diffusers/models/attention_processor.py#L402)), which can be helpful to make the pipeline more deterministic, or if you need to convert a fine-tuned model to other formats such as [Core ML](https://huggingface.co/docs/diffusers/v0.16.0/en/optimization/coreml#how-to-run-stable-diffusion-with-core-ml). To use the normal attention processor you can use the [`~diffusers.UNet2DConditionModel.set_default_attn_processor`] function:
+  pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True).to("cuda")
++ pipe.unet.set_default_attn_processor()
 
-    ```Python
-    import torch
-    from diffusers import DiffusionPipeline
-    from diffusers.models.attention_processor import AttnProcessor
+  prompt = "a photo of an astronaut riding a horse on mars"
+  image = pipe(prompt).images[0]
+```
 
-    pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True).to("cuda")
-    pipe.unet.set_default_attn_processor()
+## torch.compile
 
-    prompt = "a photo of an astronaut riding a horse on mars"
-    image = pipe(prompt).images[0]
-    ```
+The `torch.compile` function can often provide an additional speed-up to your PyTorch code. In 🤗 Diffusers, it is usually best to wrap the UNet with `torch.compile` because it does most of the heavy lifting in the pipeline.
 
-2. **torch.compile**
+```python
+from diffusers import DiffusionPipeline
+import torch
 
-   To get an additional speedup, we can use the new `torch.compile` feature. Since the UNet of the pipeline is usually the most computationally expensive, we wrap the `unet` with `torch.compile` leaving rest of the sub-models (text encoder and VAE) as is. For more information and different options, refer to the
-   [torch compile docs](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html).
+pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True).to("cuda")
+pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+
+prompt = "a photo of an astronaut riding a horse on mars"
+images = pipe(prompt, num_inference_steps=25, num_images_per_prompt=4).images
+```
 
-   ```python
-   pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
-   images = pipe(prompt, num_inference_steps=steps, num_images_per_prompt=batch_size).images
-   ```
 
-   Depending on the type of GPU, `compile()` can yield between **5% - 300%** of _additional speed-up_ over the accelerated transformer optimizations. Note, however, that compilation is able to squeeze more performance improvements in more recent GPU architectures such as Ampere (A100, 3090), Ada (4090) and Hopper (H100).
+Depending on GPU type, `torch.compile` can provide an *additional speed-up* of **5-300%** on top of SDPA! If you're using more recent GPU architectures such as Ampere (A100, 3090), Ada (4090), and Hopper (H100), `torch.compile` is able to squeeze even more performance out of these GPUs.
 
-   Compilation takes some time to complete, so it is best suited for situations where you need to prepare your pipeline once and then perform the same type of inference operations multiple times. Calling the compiled pipeline on a different image size will re-trigger compilation which can be expensive.
+Compilation requires some time to complete, so it is best suited for situations where you prepare your pipeline once and then perform the same type of inference operations multiple times. For example, calling the compiled pipeline on a different image size triggers compilation again, which can be expensive.
 
+For more information and different options about `torch.compile`, refer to the [`torch.compile`](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) tutorial.
 
 ## Benchmark
 
-We conducted a comprehensive benchmark with PyTorch 2.0's efficient attention implementation and `torch.compile` across different GPUs and batch sizes for five of our most used pipelines. We used `diffusers 0.17.0.dev0`, which [makes sure `torch.compile()` is leveraged optimally](https://github.com/huggingface/diffusers/pull/3313).
+We conducted a comprehensive benchmark with PyTorch 2.0's efficient attention implementation and `torch.compile` across different GPUs and batch sizes for five of our most used pipelines. The code was benchmarked with 🤗 Diffusers v0.17.0.dev0, which ensures `torch.compile` is leveraged optimally (see [this PR](https://github.com/huggingface/diffusers/pull/3313) for more details).
 
-### Benchmarking code
+The code used to benchmark each pipeline is shown below:
 
-#### Stable Diffusion text-to-image
+
-```python +### Stable Diffusion text-to-image + +```python from diffusers import DiffusionPipeline import torch @@ -121,7 +108,7 @@ for _ in range(3): images = pipe(prompt=prompt).images ``` -#### Stable Diffusion image-to-image +### Stable Diffusion image-to-image ```python from diffusers import StableDiffusionImg2ImgPipeline @@ -154,7 +141,7 @@ for _ in range(3): image = pipe(prompt=prompt, image=init_image).images[0] ``` -#### Stable Diffusion - inpainting +### Stable Diffusion inpainting ```python from diffusers import StableDiffusionInpaintPipeline @@ -194,7 +181,7 @@ for _ in range(3): image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0] ``` -#### ControlNet +### ControlNet ```python from diffusers import StableDiffusionControlNetPipeline, ControlNetModel @@ -232,7 +219,7 @@ for _ in range(3): image = pipe(prompt=prompt, image=init_image).images[0] ``` -#### IF text-to-image + upscaling +### DeepFloyd IF text-to-image + upscaling ```python from diffusers import DiffusionPipeline @@ -267,24 +254,18 @@ for _ in range(3): image_2 = pipe_2(image=image, prompt_embeds=prompt_embeds, negative_prompt_embeds=neg_prompt_embeds, output_type="pt").images image_3 = pipe_3(prompt=prompt, image=image, noise_level=100).images ``` +
-To give you a pictorial overview of the possible speed-ups that can be obtained with PyTorch 2.0 and `torch.compile()`, -here is a plot that shows relative speed-ups for the [Stable Diffusion text-to-image pipeline](StableDiffusionPipeline) across five -different GPU families (with a batch size of 4): +The graph below highlights the relative speed-ups for the [`StableDiffusionPipeline`] across five GPU families with PyTorch 2.0 and `torch.compile` enabled. The benchmarks for the following graphs are measured in *number of iterations/second*. ![t2i_speedup](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/pt2_benchmarks/t2i_speedup.png) -To give you an even better idea of how this speed-up holds for the other pipelines presented above, consider the following -plot that shows the benchmarking numbers from an A100 across three different batch sizes -(with PyTorch 2.0 nightly and `torch.compile()`): +To give you an even better idea of how this speed-up holds for the other pipelines, consider the following +graph for an A100 with PyTorch 2.0 and `torch.compile`: ![a100_numbers](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/pt2_benchmarks/a100_numbers.png) -_(Our benchmarking metric for the plots above is **number of iterations/second**)_ - -But we reveal all the benchmarking numbers in the interest of transparency! - -In the following tables, we report our findings in terms of the number of **_iterations processed per second_**. +In the following tables, we report our findings in terms of the *number of iterations/second*. ### A100 (batch size: 1) @@ -438,7 +419,7 @@ In the following tables, we report our findings in terms of the number of **_ite ## Notes -* Follow [this PR](https://github.com/huggingface/diffusers/pull/3313) for more details on the environment used for conducting the benchmarks. -* For the IF pipeline and batch sizes > 1, we only used a batch size of >1 in the first IF pipeline for text-to-image generation and NOT for upscaling. So, that means the two upscaling pipelines received a batch size of 1. +* Follow this [PR](https://github.com/huggingface/diffusers/pull/3313) for more details on the environment used for conducting the benchmarks. +* For the DeepFloyd IF pipeline where batch sizes > 1, we only used a batch size of > 1 in the first IF pipeline for text-to-image generation and NOT for upscaling. That means the two upscaling pipelines received a batch size of 1. *Thanks to [Horace He](https://github.com/Chillee) from the PyTorch team for their support in improving our support of `torch.compile()` in Diffusers.* \ No newline at end of file diff --git a/docs/source/en/optimization/xformers.md b/docs/source/en/optimization/xformers.md index ede074a59fa9..e5aa4d106ad2 100644 --- a/docs/source/en/optimization/xformers.md +++ b/docs/source/en/optimization/xformers.md @@ -10,11 +10,11 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. --> -# Installing xFormers +# xFormers -We recommend the use of [xFormers](https://github.com/facebookresearch/xformers) for both inference and training. In our tests, the optimizations performed in the attention blocks allow for both faster speed and reduced memory consumption. +We recommend [xFormers](https://github.com/facebookresearch/xformers) for both inference and training. 
In our tests, the optimizations performed in the attention blocks allow for both faster speed and reduced memory consumption.
 
-Starting from version `0.0.16` of xFormers, released on January 2023, installation can be easily performed using pre-built pip wheels:
+Install xFormers from `pip`:
 
 ```bash
 pip install xformers
@@ -22,14 +22,14 @@ pip install xformers
 ```
 
 <Tip>
 
-The xFormers PIP package requires the latest version of PyTorch (1.13.1 as of xFormers 0.0.16). If you need to use a previous version of PyTorch, then we recommend you install xFormers from source using [the project instructions](https://github.com/facebookresearch/xformers#installing-xformers).
+The xFormers `pip` package requires the latest version of PyTorch. If you need to use a previous version of PyTorch, then we recommend [installing xFormers from source](https://github.com/facebookresearch/xformers#installing-xformers).
 
 </Tip>
 
-After xFormers is installed, you can use `enable_xformers_memory_efficient_attention()` for faster inference and reduced memory consumption, as discussed [here](fp16#memory-efficient-attention).
+After xFormers is installed, you can use `enable_xformers_memory_efficient_attention()` for faster inference and reduced memory consumption as shown in this [section](memory#memory-efficient-attention).
 
 <Tip warning={true}>
 
-According to [this issue](https://github.com/huggingface/diffusers/issues/2234#issuecomment-1416931212), xFormers `v0.0.16` cannot be used for training (fine-tune or Dreambooth) in some GPUs. If you observe that problem, please install a development version as indicated in that comment.
+According to this [issue](https://github.com/huggingface/diffusers/issues/2234#issuecomment-1416931212), xFormers `v0.0.16` cannot be used for training (fine-tune or DreamBooth) on some GPUs. If you observe this problem, please install a development version as indicated in the issue comments.
 
 </Tip>
 
diff --git a/docs/source/en/using-diffusers/stable_diffusion_jax_how_to.md b/docs/source/en/using-diffusers/stable_diffusion_jax_how_to.md
index 91d49a5f6748..d62ce0bf91bf 100644
--- a/docs/source/en/using-diffusers/stable_diffusion_jax_how_to.md
+++ b/docs/source/en/using-diffusers/stable_diffusion_jax_how_to.md
@@ -1,51 +1,41 @@
-# 🧨 Stable Diffusion in JAX / Flax !
+# JAX/Flax
 
 [[open-in-colab]]
 
-🤗 Hugging Face [Diffusers](https://github.com/huggingface/diffusers) supports Flax since version `0.5.1`! This allows for super fast inference on Google TPUs, such as those available in Colab, Kaggle or Google Cloud Platform.
+🤗 Diffusers supports Flax for super fast inference on Google TPUs, such as those available in Colab, Kaggle or Google Cloud Platform. This guide shows you how to run inference with Stable Diffusion using JAX/Flax.
 
-This notebook shows how to run inference using JAX / Flax. If you want more details about how Stable Diffusion works or want to run it in GPU, please refer to [this notebook](https://huggingface.co/docs/diffusers/stable_diffusion).
-
-First, make sure you are using a TPU backend. If you are running this notebook in Colab, select `Runtime` in the menu above, then select the option "Change runtime type" and then select `TPU` under the `Hardware accelerator` setting.
-
-Note that JAX is not exclusive to TPUs, but it shines on that hardware because each TPU server has 8 TPU accelerators working in parallel.
-
-## Setup
-
-First make sure diffusers is installed.
+Before you begin, make sure you have the necessary libraries installed:
 
 ```py
 # uncomment to install the necessary libraries in Colab
-#!pip install jax==0.3.25 jaxlib==0.3.25 flax transformers ftfy
-#!pip install diffusers
+#!pip install -q jax==0.3.25 jaxlib==0.3.25 flax transformers ftfy
+#!pip install -q diffusers
 ```
 
+You should also make sure you're using a TPU backend. While JAX does not run exclusively on TPUs, you'll get the best performance on a TPU because each server has 8 TPU accelerators working in parallel.
+
+If you are running this guide in Colab, select *Runtime* in the menu above, select the option *Change runtime type*, and then select *TPU* under the *Hardware accelerator* setting.
+
+Import JAX and quickly check whether you're using a TPU:
 
 ```python
+import jax
 import jax.tools.colab_tpu
-
 jax.tools.colab_tpu.setup_tpu()
-import jax
-```
 
-```python
 num_devices = jax.device_count()
 device_type = jax.devices()[0].device_kind
 
 print(f"Found {num_devices} JAX devices of type {device_type}.")
 assert (
-    "TPU" in device_type
-), "Available device is not a TPU, please select TPU from Edit > Notebook settings > Hardware accelerator"
-```
-
-```python out
-Found 8 JAX devices of type Cloud TPU.
+    "TPU" in device_type
+), "Available device is not a TPU, please select TPU from Edit > Notebook settings > Hardware accelerator"
+
+"Found 8 JAX devices of type Cloud TPU."
 ```
 
-Then we import all the dependencies.
+Great, now you can import the rest of the dependencies you'll need:
 
 ```python
 import numpy as np
-import jax
 import jax.numpy as jnp
 
 from pathlib import Path
@@ -58,17 +48,12 @@ from huggingface_hub import notebook_login
 from diffusers import FlaxStableDiffusionPipeline
 ```
 
-## Model Loading
+## Load a model
 
-TPU devices support `bfloat16`, an efficient half-float type. We'll use it for our tests, but you can also use `float32` to use full precision instead.
+Flax is a functional framework, so models are stateless and parameters are stored outside of them. Loading a pretrained Flax pipeline returns *both* the pipeline and the model weights (or parameters). In this guide, you'll use `bfloat16`, a more efficient half-float type that is supported by TPUs (you can also use `float32` for full precision if you want).
 
 ```python
 dtype = jnp.bfloat16
-```
-
-Flax is a functional framework, so models are stateless and parameters are stored outside them. Loading the pre-trained Flax pipeline will return both the pipeline itself and the model weights (or parameters). We are using a `bf16` version of the weights, which leads to type warnings that you can safely ignore.
-
-```python
 pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(
     "CompVis/stable-diffusion-v1-4",
     revision="bf16",
@@ -78,95 +63,87 @@ pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(
 ```
 
 ## Inference
 
-Since TPUs usually have 8 devices working in parallel, we'll replicate our prompt as many times as devices we have. Then we'll perform inference on the 8 devices at once, each responsible for generating one image. Thus, we'll get 8 images in the same amount of time it takes for one chip to generate a single one.
+TPUs usually have 8 devices working in parallel, so let's use the same prompt for each device. This means you can perform inference on 8 devices at once, with each device generating one image. As a result, you'll get 8 images in the same amount of time it takes for one chip to generate a single image!
+
+<Tip>
+
+Learn more details in the [How does parallelization work?](#how-does-parallelization-work) section.
+
+</Tip>
 
-After replicating the prompt, we obtain the tokenized text ids by invoking the `prepare_inputs` function of the pipeline. The length of the tokenized text is set to 77 tokens, as required by the configuration of the underlying CLIP Text model.
+
+After replicating the prompt, get the tokenized text ids by calling the `prepare_inputs` function on the pipeline. The length of the tokenized text is set to 77 tokens as required by the configuration of the underlying CLIP text model.
 
 ```python
 prompt = "A cinematic film still of Morgan Freeman starring as Jimi Hendrix, portrait, 40mm lens, shallow depth of field, close up, split lighting, cinematic"
 prompt = [prompt] * jax.device_count()
 prompt_ids = pipeline.prepare_inputs(prompt)
 prompt_ids.shape
+"(8, 77)"
 ```
 
-```python out
-(8, 77)
-```
-
-### Replication and parallelization
-
-Model parameters and inputs have to be replicated across the 8 parallel devices we have. The parameters dictionary is replicated using `flax.jax_utils.replicate`, which traverses the dictionary and changes the shape of the weights so they are repeated 8 times. Arrays are replicated using `shard`.
+Model parameters and inputs have to be replicated across the 8 parallel devices. The parameters dictionary is replicated with [`flax.jax_utils.replicate`](https://flax.readthedocs.io/en/latest/api_reference/flax.jax_utils.html#flax.jax_utils.replicate), which traverses the dictionary and changes the shape of the weights so they are repeated 8 times. Arrays are replicated using `shard`.
 
 ```python
+# parameters
 p_params = replicate(params)
-```
 
-```python
+# arrays
 prompt_ids = shard(prompt_ids)
 prompt_ids.shape
+"(8, 1, 77)"
 ```
 
-That shape means that each one of the `8` devices will receive as an input a `jnp` array with shape `(1, 77)`. `1` is therefore the batch size per device. In TPUs with sufficient memory, it could be larger than `1` if we wanted to generate multiple images (per chip) at once.
+This shape means each one of the 8 devices receives as an input a `jnp` array with shape `(1, 77)`, where `1` is the batch size per device. On TPUs with sufficient memory, you could have a batch size larger than `1` if you want to generate multiple images (per chip) at once.
 
-We are almost ready to generate images! We just need to create a random number generator to pass to the generation function. This is the standard procedure in Flax, which is very serious and opinionated about random numbers – all functions that deal with random numbers are expected to receive a generator. This ensures reproducibility, even when we are training across multiple distributed devices.
+Next, create a random number generator to pass to the generation function. This is standard procedure in Flax, which is very serious and opinionated about random numbers. All functions that deal with random numbers are expected to receive a generator to ensure reproducibility, even when you're training across multiple distributed devices.
 
-The helper function below uses a seed to initialize a random number generator. As long as we use the same seed, we'll get the exact same results. Feel free to use different seeds when exploring results later in the notebook.
+The helper function below uses a seed to initialize a random number generator. As long as you use the same seed, you'll get the exact same results. Feel free to use different seeds when exploring results later in the guide.
 
 ```python
 def create_key(seed=0):
     return jax.random.PRNGKey(seed)
 ```
 
-We obtain a rng and then "split" it 8 times so each device receives a different generator. Therefore, each device will create a different image, and the full process is reproducible.
+The `rng` returned by the helper function is split 8 times so each device receives a different generator and generates a different image.
 
 ```python
 rng = create_key(0)
 rng = jax.random.split(rng, jax.device_count())
 ```
 
-JAX code can be compiled to an efficient representation that runs very fast. However, we need to ensure that all inputs have the same shape in subsequent calls; otherwise, JAX will have to recompile the code, and we wouldn't be able to take advantage of the optimized speed.
+To take advantage of JAX's optimized speed on a TPU, pass `jit=True` to the pipeline to compile the JAX code into an efficient representation and to ensure the model runs in parallel across the 8 devices.
 
-The Flax pipeline can compile the code for us if we pass `jit = True` as an argument. It will also ensure that the model runs in parallel in the 8 available devices.
+<Tip warning={true}>
 
-The first time we run the following cell it will take a long time to compile, but subequent calls (even with different inputs) will be much faster. For example, it took more than a minute to compile in a TPU v2-8 when I tested, but then it takes about **`7s`** for future inference runs.
+You need to ensure all your inputs have the same shape in subsequent calls, otherwise JAX will need to recompile the code, which is slower.
 
-```
-%%time
-images = pipeline(prompt_ids, p_params, rng, jit=True)[0]
-```
+</Tip>
 
-```python out
-CPU times: user 56.2 s, sys: 42.5 s, total: 1min 38s
-Wall time: 1min 29s
-```
+The first inference run takes more time because it needs to compile the code, but subsequent calls (even with different inputs) are much faster. For example, it took more than a minute to compile on a TPU v2-8, but then it takes about **7s** on a future inference run!
 
-The returned array has shape `(8, 1, 512, 512, 3)`. We reshape it to get rid of the second dimension and obtain 8 images of `512 × 512 × 3` and then convert them to PIL.
+```py
+%%time
+images = pipeline(prompt_ids, p_params, rng, jit=True)[0]
 
-```python
-images = images.reshape((images.shape[0] * images.shape[1],) + images.shape[-3:])
-images = pipeline.numpy_to_pil(images)
+"CPU times: user 56.2 s, sys: 42.5 s, total: 1min 38s"
+"Wall time: 1min 29s"
 ```
 
-### Visualization
+The returned array has shape `(8, 1, 512, 512, 3)` which should be reshaped to remove the second dimension and get 8 images of `512 × 512 × 3`. Then you can use the [`~utils.numpy_to_pil`] function to convert the arrays into images.
 
 ```python
 from diffusers import make_image_grid
 
+images = images.reshape((images.shape[0] * images.shape[1],) + images.shape[-3:])
+images = pipeline.numpy_to_pil(images)
 make_image_grid(images, 2, 4)
 ```
 
 ![img](https://huggingface.co/datasets/YiYiXu/test-doc-assets/resolve/main/stable_diffusion_jax_how_to_cell_38_output_0.jpeg)
 
-
 ## Using different prompts
 
-We don't have to replicate the _same_ prompt in all the devices. We can do whatever we want: generate 2 prompts 4 times each, or even generate 8 different prompts at once. Let's do that!
-
-First, we'll refactor the input preparation code into a handy function:
+You don't necessarily have to use the same prompt on all devices.
For example, to generate 8 different prompts: ```python prompts = [ @@ -179,9 +156,7 @@ prompts = [ "Armchair in the shape of an avocado", "Clown astronaut in space, with Earth in the background", ] -``` -```python prompt_ids = pipeline.prepare_inputs(prompts) prompt_ids = shard(prompt_ids) @@ -197,46 +172,41 @@ make_image_grid(images, 2, 4) ## How does parallelization work? -We said before that the `diffusers` Flax pipeline automatically compiles the model and runs it in parallel on all available devices. We'll now briefly look inside that process to show how it works. +The Flax pipeline in 🤗 Diffusers automatically compiles the model and runs it in parallel on all available devices. Let's take a closer look at how that process works. -JAX parallelization can be done in multiple ways. The easiest one revolves around using the `jax.pmap` function to achieve single-program, multiple-data (SPMD) parallelization. It means we'll run several copies of the same code, each on different data inputs. More sophisticated approaches are possible, we invite you to go over the [JAX documentation](https://jax.readthedocs.io/en/latest/index.html) and the [`pjit` pages](https://jax.readthedocs.io/en/latest/jax-101/08-pjit.html?highlight=pjit) to explore this topic if you are interested! +JAX parallelization can be done in multiple ways. The easiest one revolves around using the [`jax.pmap`](https://jax.readthedocs.io/en/latest/_autosummary/jax.pmap.html) function to achieve single-program multiple-data (SPMD) parallelization. It means running several copies of the same code, each on different data inputs. More sophisticated approaches are possible, and you can go over to the JAX [documentation](https://jax.readthedocs.io/en/latest/index.html) to explore this topic in more detail if you are interested! -`jax.pmap` does two things for us: -- Compiles (or `jit`s) the code, as if we had invoked `jax.jit()`. This does not happen when we call `pmap`, but the first time the pmapped function is invoked. -- Ensures the compiled code runs in parallel in all the available devices. +`jax.pmap` does two things: -To show how it works we `pmap` the `_generate` method of the pipeline, which is the private method that runs generates images. Please, note that this method may be renamed or removed in future releases of `diffusers`. +1. Compiles (or "`jit`s") the code which is similar to `jax.jit()`. This does not happen when you call `pmap`, and only the first time the `pmap`ped function is called. +2. Ensures the compiled code runs in parallel on all available devices. + +To demonstrate, call `pmap` on the pipeline's `_generate` method (this is a private method that generates images and may be renamed or removed in future releases of 🤗 Diffusers): ```python p_generate = pmap(pipeline._generate) ``` -After we use `pmap`, the prepared function `p_generate` will conceptually do the following: -* Invoke a copy of the underlying function `pipeline._generate` in each device. -* Send each device a different portion of the input arguments. That's what sharding is used for. In our case, `prompt_ids` has shape `(8, 1, 77, 768)`. This array will be split in `8` and each copy of `_generate` will receive an input with shape `(1, 77, 768)`. +After calling `pmap`, the prepared function `p_generate` will: -We can code `_generate` completely ignoring the fact that it will be invoked in parallel. 
We just care about our batch size (`1` in this example) and the dimensions that make sense for our code, and don't have to change anything to make it work in parallel.
+1. Make a copy of the underlying function, `pipeline._generate`, on each device.
+2. Send each device a different portion of the input arguments (this is why it's necessary to call the *shard* function). In this case, `prompt_ids` has shape `(8, 1, 77, 768)` so the array is split into 8 and each copy of `_generate` receives an input with shape `(1, 77, 768)`.
 
-The same way as when we used the pipeline call, the first time we run the following cell it will take a while, but then it will be much faster.
+The most important thing to pay attention to here is the batch size (1 in this example), and the input dimensions that make sense for your code. You don't have to change anything else to make the code work in parallel.
 
-```
+The first time you call the pipeline, it takes more time, but the calls afterward are much faster. The `block_until_ready` function is used to correctly measure inference time because JAX uses asynchronous dispatch and returns control to the Python loop as soon as it can. You don't need to use that in your code; blocking occurs automatically when you want to use the result of a computation that has not yet been materialized.
+
+```py
 %%time
 images = p_generate(prompt_ids, p_params, rng)
 images = images.block_until_ready()
-images.shape
+"CPU times: user 1min 15s, sys: 18.2 s, total: 1min 34s"
+"Wall time: 1min 15s"
 ```
 
-```python out
-CPU times: user 1min 15s, sys: 18.2 s, total: 1min 34s
-Wall time: 1min 15s
-```
+Check your image dimensions to see if they're correct:
 
 ```python
 images.shape
-```
-
-```python out
-(8, 1, 512, 512, 3)
-```
-
-We use `block_until_ready()` to correctly measure inference time, because JAX uses asynchronous dispatch and returns control to the Python loop as soon as it can. You don't need to use that in your code; blocking will occur automatically when you want to use the result of a computation that has not yet been materialized.
\ No newline at end of file
+"(8, 1, 512, 512, 3)"
+```
\ No newline at end of file