From 1d686bac8146037e97f3fd8c56e4063230f71751 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Tue, 12 Dec 2023 11:03:34 +0530 Subject: [PATCH 01/30] [feat: Benchmarking Workflow] add stuff for a benchmarking workflow (#5839) * add poc for benchmarking workflow. * import * fix argument * fix: argument * fix: path * fix * fix * path * output csv files. * workflow cleanup * append token * add utility to push to hf dataset * fix: kw arg * better reporting * fix: headers * better formatting of the numbers. * better type annotation * fix: formatting * moentarily disable check * push results. * remove disable check * introduce base classes. * img2img class * add inpainting pipeline * intoduce base benchmark class. * add img2img and inpainting * feat: utility to compare changes * fix * fix import * add args * basepath * better exception handling * better path handling * fix * fix * remove * ifx * fix * add: support for controlnet. * image_url -> url * move images to huggingface hub * correct urls. * root_ckpt * flush before benchmarking * don't install accelerate from source * add runner * simplify Diffusers Benchmarking step * change runner * fix: subprocess call. * filter percentage values * fix controlnet benchmark * add t2i adapters. * fix filter columns * fix t2i adapter benchmark * fix init. * fix * remove safetensors flag * fix args print * fix * feat: run_command * add adapter resolution mapping * benchmark t2i adapter fix. * fix adapter input * fix * convert to L. * add flush() add appropriate places * better filtering * okay * get env for torch * convert to float * fix * filter out nans. * better coment * sdxl * sdxl for other benchmarks. * fix: condition * fix: condition for inpainting * fix: mapping for resolution * fix * include kandinsky and wuerstchen * fix: Wuerstchen * Empty-Commit * [Community] AnimateDiff + Controlnet Pipeline (#5928) * begin work on animatediff + controlnet pipeline * complete todos, uncomment multicontrolnet, input checks Co-Authored-By: EdoardoBotta * update Co-Authored-By: EdoardoBotta * add example * update community README * Update examples/community/README.md --------- Co-authored-by: EdoardoBotta Co-authored-by: Patrick von Platen * EulerDiscreteScheduler add `rescale_betas_zero_snr` (#6024) * EulerDiscreteScheduler add `rescale_betas_zero_snr` * Revert "[Community] AnimateDiff + Controlnet Pipeline (#5928)" This reverts commit 821726d7c0fba25f06ed8bba26984d9ccc014871. * Revert "EulerDiscreteScheduler add `rescale_betas_zero_snr` (#6024)" This reverts commit 3dc2362b5a89380f66ac006b1a787411fa1a9574. * add SDXL turbo * add lcm lora to the mix as well. 
* fix * increase steps to 2 when running turbo i2i * debug * debug * debug * fix for good * fix and isolate better * fuse lora so that torch compile works with peft * fix: LCMLoRA * better identification for LCM * change to cron job --------- Co-authored-by: Patrick von Platen Co-authored-by: Dhruv Nair Co-authored-by: Aryan V S Co-authored-by: EdoardoBotta Co-authored-by: Beinsezii <39478211+Beinsezii@users.noreply.github.com> --- .github/workflows/benchmark.yml | 52 +++++ Makefile | 2 +- benchmarks/base_classes.py | 297 ++++++++++++++++++++++++++ benchmarks/benchmark_controlnet.py | 26 +++ benchmarks/benchmark_sd_img.py | 29 +++ benchmarks/benchmark_sd_inpainting.py | 28 +++ benchmarks/benchmark_t2i_adapter.py | 28 +++ benchmarks/benchmark_t2i_lcm_lora.py | 23 ++ benchmarks/benchmark_text_to_image.py | 40 ++++ benchmarks/push_results.py | 72 +++++++ benchmarks/run_all.py | 97 +++++++++ benchmarks/utils.py | 98 +++++++++ 12 files changed, 791 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/benchmark.yml create mode 100644 benchmarks/base_classes.py create mode 100644 benchmarks/benchmark_controlnet.py create mode 100644 benchmarks/benchmark_sd_img.py create mode 100644 benchmarks/benchmark_sd_inpainting.py create mode 100644 benchmarks/benchmark_t2i_adapter.py create mode 100644 benchmarks/benchmark_t2i_lcm_lora.py create mode 100644 benchmarks/benchmark_text_to_image.py create mode 100644 benchmarks/push_results.py create mode 100644 benchmarks/run_all.py create mode 100644 benchmarks/utils.py diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml new file mode 100644 index 000000000000..c4c3c101dbfd --- /dev/null +++ b/.github/workflows/benchmark.yml @@ -0,0 +1,52 @@ +name: Benchmarking tests + +on: + schedule: + - cron: "30 1 1,15 * *" # every 2 weeks on the 1st and the 15th of every month at 1:30 AM + +env: + DIFFUSERS_IS_CI: yes + HF_HOME: /mnt/cache + OMP_NUM_THREADS: 8 + MKL_NUM_THREADS: 8 + +jobs: + torch_pipelines_cuda_benchmark_tests: + name: Torch Core Pipelines CUDA Benchmarking Tests + strategy: + fail-fast: false + max-parallel: 1 + runs-on: [single-gpu, nvidia-gpu, a10, ci] + container: + image: diffusers/diffusers-pytorch-cuda + options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0 + steps: + - name: Checkout diffusers + uses: actions/checkout@v3 + with: + fetch-depth: 2 + - name: NVIDIA-SMI + run: | + nvidia-smi + - name: Install dependencies + run: | + apt-get update && apt-get install libsndfile1-dev libgl1 -y + python -m pip install -e .[quality,test] + python -m pip install pandas + - name: Environment + run: | + python utils/print_env.py + - name: Diffusers Benchmarking + env: + HUGGING_FACE_HUB_TOKEN: ${{ secrets.DIFFUSERS_BOT_TOKEN }} + BASE_PATH: benchmark_outputs + run: | + export TOTAL_GPU_MEMORY=$(python -c "import torch; print(torch.cuda.get_device_properties(0).total_memory / (1024**3))") + cd benchmarks && mkdir ${BASE_PATH} && python run_all.py && python push_results.py + + - name: Test suite reports artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v2 + with: + name: benchmark_test_reports + path: benchmarks/benchmark_outputs \ No newline at end of file diff --git a/Makefile b/Makefile index 70bfced8c7b4..c92285b48c71 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ # make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!) 
export PYTHONPATH = src -check_dirs := examples scripts src tests utils +check_dirs := examples scripts src tests utils benchmarks modified_only_fixup: $(eval modified_py_files := $(shell python utils/get_modified_files.py $(check_dirs))) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py new file mode 100644 index 000000000000..5d328f62b904 --- /dev/null +++ b/benchmarks/base_classes.py @@ -0,0 +1,297 @@ +import os +import sys + +import torch + +from diffusers import ( + AutoPipelineForImage2Image, + AutoPipelineForInpainting, + AutoPipelineForText2Image, + ControlNetModel, + LCMScheduler, + StableDiffusionAdapterPipeline, + StableDiffusionControlNetPipeline, + StableDiffusionXLAdapterPipeline, + StableDiffusionXLControlNetPipeline, + T2IAdapter, + WuerstchenCombinedPipeline, +) +from diffusers.utils import load_image + + +sys.path.append(".") + +from utils import ( # noqa: E402 + BASE_PATH, + PROMPT, + BenchmarkInfo, + benchmark_fn, + bytes_to_giga_bytes, + flush, + generate_csv_dict, + write_to_csv, +) + + +RESOLUTION_MAPPING = { + "runwayml/stable-diffusion-v1-5": (512, 512), + "lllyasviel/sd-controlnet-canny": (512, 512), + "diffusers/controlnet-canny-sdxl-1.0": (1024, 1024), + "TencentARC/t2iadapter_canny_sd14v1": (512, 512), + "TencentARC/t2i-adapter-canny-sdxl-1.0": (1024, 1024), + "stabilityai/stable-diffusion-2-1": (768, 768), + "stabilityai/stable-diffusion-xl-base-1.0": (1024, 1024), + "stabilityai/stable-diffusion-xl-refiner-1.0": (1024, 1024), + "stabilityai/sdxl-turbo": (512, 512), +} + + +class BaseBenchmak: + pipeline_class = None + + def __init__(self, args): + super().__init__() + + def run_inference(self, args): + raise NotImplementedError + + def benchmark(self, args): + raise NotImplementedError + + def get_result_filepath(self, args): + pipeline_class_name = str(self.pipe.__class__.__name__) + name = ( + args.ckpt.replace("/", "_") + + "_" + + pipeline_class_name + + f"-bs@{args.batch_size}-steps@{args.num_inference_steps}-mco@{args.model_cpu_offload}-compile@{args.run_compile}.csv" + ) + filepath = os.path.join(BASE_PATH, name) + return filepath + + +class TextToImageBenchmark(BaseBenchmak): + pipeline_class = AutoPipelineForText2Image + + def __init__(self, args): + pipe = self.pipeline_class.from_pretrained(args.ckpt, torch_dtype=torch.float16) + pipe = pipe.to("cuda") + + if args.run_compile: + if not isinstance(pipe, WuerstchenCombinedPipeline): + pipe.unet.to(memory_format=torch.channels_last) + print("Run torch compile") + pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) + + if hasattr(pipe, "movq") and getattr(pipe, "movq", None) is not None: + pipe.movq.to(memory_format=torch.channels_last) + pipe.movq = torch.compile(pipe.movq, mode="reduce-overhead", fullgraph=True) + else: + print("Run torch compile") + pipe.decoder = torch.compile(pipe.decoder, mode="reduce-overhead", fullgraph=True) + pipe.vqgan = torch.compile(pipe.vqgan, mode="reduce-overhead", fullgraph=True) + + pipe.set_progress_bar_config(disable=True) + self.pipe = pipe + + def run_inference(self, pipe, args): + _ = pipe( + prompt=PROMPT, + num_inference_steps=args.num_inference_steps, + num_images_per_prompt=args.batch_size, + ) + + def benchmark(self, args): + flush() + + print(f"[INFO] {self.pipe.__class__.__name__}: Running benchmark with: {vars(args)}\n") + + time = benchmark_fn(self.run_inference, self.pipe, args) # in seconds. + memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) # in GBs. 
+ benchmark_info = BenchmarkInfo(time=time, memory=memory) + + pipeline_class_name = str(self.pipe.__class__.__name__) + flush() + csv_dict = generate_csv_dict( + pipeline_cls=pipeline_class_name, ckpt=args.ckpt, args=args, benchmark_info=benchmark_info + ) + filepath = self.get_result_filepath(args) + write_to_csv(filepath, csv_dict) + print(f"Logs written to: {filepath}") + flush() + + +class TurboTextToImageBenchmark(TextToImageBenchmark): + def __init__(self, args): + super().__init__(args) + + def run_inference(self, pipe, args): + _ = pipe( + prompt=PROMPT, + num_inference_steps=args.num_inference_steps, + num_images_per_prompt=args.batch_size, + guidance_scale=0.0, + ) + + +class LCMLoRATextToImageBenchmark(TextToImageBenchmark): + lora_id = "latent-consistency/lcm-lora-sdxl" + + def __init__(self, args): + super().__init__(args) + self.pipe.load_lora_weights(self.lora_id) + self.pipe.fuse_lora() + self.pipe.scheduler = LCMScheduler.from_config(self.pipe.scheduler.config) + + def get_result_filepath(self, args): + pipeline_class_name = str(self.pipe.__class__.__name__) + name = ( + self.lora_id.replace("/", "_") + + "_" + + pipeline_class_name + + f"-bs@{args.batch_size}-steps@{args.num_inference_steps}-mco@{args.model_cpu_offload}-compile@{args.run_compile}.csv" + ) + filepath = os.path.join(BASE_PATH, name) + return filepath + + def run_inference(self, pipe, args): + _ = pipe( + prompt=PROMPT, + num_inference_steps=args.num_inference_steps, + num_images_per_prompt=args.batch_size, + guidance_scale=1.0, + ) + + +class ImageToImageBenchmark(TextToImageBenchmark): + pipeline_class = AutoPipelineForImage2Image + url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/1665_Girl_with_a_Pearl_Earring.jpg" + image = load_image(url).convert("RGB") + + def __init__(self, args): + super().__init__(args) + self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt]) + + def run_inference(self, pipe, args): + _ = pipe( + prompt=PROMPT, + image=self.image, + num_inference_steps=args.num_inference_steps, + num_images_per_prompt=args.batch_size, + ) + + +class TurboImageToImageBenchmark(ImageToImageBenchmark): + def __init__(self, args): + super().__init__(args) + + def run_inference(self, pipe, args): + _ = pipe( + prompt=PROMPT, + image=self.image, + num_inference_steps=args.num_inference_steps, + num_images_per_prompt=args.batch_size, + guidance_scale=0.0, + strength=0.5, + ) + + +class InpaintingBenchmark(ImageToImageBenchmark): + pipeline_class = AutoPipelineForInpainting + mask_url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/overture-creations-5sI6fQgYIuo_mask.png" + mask = load_image(mask_url).convert("RGB") + + def __init__(self, args): + super().__init__(args) + self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt]) + self.mask = self.mask.resize(RESOLUTION_MAPPING[args.ckpt]) + + def run_inference(self, pipe, args): + _ = pipe( + prompt=PROMPT, + image=self.image, + mask_image=self.mask, + num_inference_steps=args.num_inference_steps, + num_images_per_prompt=args.batch_size, + ) + + +class ControlNetBenchmark(TextToImageBenchmark): + pipeline_class = StableDiffusionControlNetPipeline + aux_network_class = ControlNetModel + root_ckpt = "runwayml/stable-diffusion-v1-5" + + url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_image_condition.png" + image = load_image(url).convert("RGB") + + def __init__(self, args): + aux_network = 
self.aux_network_class.from_pretrained(args.ckpt, torch_dtype=torch.float16) + pipe = self.pipeline_class.from_pretrained(self.root_ckpt, controlnet=aux_network, torch_dtype=torch.float16) + pipe = pipe.to("cuda") + + pipe.set_progress_bar_config(disable=True) + self.pipe = pipe + + if args.run_compile: + pipe.unet.to(memory_format=torch.channels_last) + pipe.controlnet.to(memory_format=torch.channels_last) + + print("Run torch compile") + pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) + pipe.controlnet = torch.compile(pipe.controlnet, mode="reduce-overhead", fullgraph=True) + + self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt]) + + def run_inference(self, pipe, args): + _ = pipe( + prompt=PROMPT, + image=self.image, + num_inference_steps=args.num_inference_steps, + num_images_per_prompt=args.batch_size, + ) + + +class ControlNetSDXLBenchmark(ControlNetBenchmark): + pipeline_class = StableDiffusionXLControlNetPipeline + root_ckpt = "stabilityai/stable-diffusion-xl-base-1.0" + + def __init__(self, args): + super().__init__(args) + + +class T2IAdapterBenchmark(ControlNetBenchmark): + pipeline_class = StableDiffusionAdapterPipeline + aux_network_class = T2IAdapter + root_ckpt = "CompVis/stable-diffusion-v1-4" + + url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_for_adapter.png" + image = load_image(url).convert("L") + + def __init__(self, args): + aux_network = self.aux_network_class.from_pretrained(args.ckpt, torch_dtype=torch.float16) + pipe = self.pipeline_class.from_pretrained(self.root_ckpt, adapter=aux_network, torch_dtype=torch.float16) + pipe = pipe.to("cuda") + + pipe.set_progress_bar_config(disable=True) + self.pipe = pipe + + if args.run_compile: + pipe.unet.to(memory_format=torch.channels_last) + pipe.adapter.to(memory_format=torch.channels_last) + + print("Run torch compile") + pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) + pipe.adapter = torch.compile(pipe.adapter, mode="reduce-overhead", fullgraph=True) + + self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt]) + + +class T2IAdapterSDXLBenchmark(T2IAdapterBenchmark): + pipeline_class = StableDiffusionXLAdapterPipeline + root_ckpt = "stabilityai/stable-diffusion-xl-base-1.0" + + url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_for_adapter_sdxl.png" + image = load_image(url) + + def __init__(self, args): + super().__init__(args) diff --git a/benchmarks/benchmark_controlnet.py b/benchmarks/benchmark_controlnet.py new file mode 100644 index 000000000000..9217004461dc --- /dev/null +++ b/benchmarks/benchmark_controlnet.py @@ -0,0 +1,26 @@ +import argparse +import sys + + +sys.path.append(".") +from base_classes import ControlNetBenchmark, ControlNetSDXLBenchmark # noqa: E402 + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--ckpt", + type=str, + default="lllyasviel/sd-controlnet-canny", + choices=["lllyasviel/sd-controlnet-canny", "diffusers/controlnet-canny-sdxl-1.0"], + ) + parser.add_argument("--batch_size", type=int, default=1) + parser.add_argument("--num_inference_steps", type=int, default=50) + parser.add_argument("--model_cpu_offload", action="store_true") + parser.add_argument("--run_compile", action="store_true") + args = parser.parse_args() + + benchmark_pipe = ( + ControlNetBenchmark(args) if args.ckpt == "lllyasviel/sd-controlnet-canny" else ControlNetSDXLBenchmark(args) + ) + benchmark_pipe.benchmark(args) 
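Each `benchmark_*.py` script above is a thin argparse wrapper around the classes in `base_classes.py`. A minimal sketch of driving `ControlNetBenchmark` directly, assuming a CUDA GPU and `benchmarks/` as the working directory (so `base_classes` and `utils` resolve):

```python
# Minimal sketch: drive ControlNetBenchmark directly instead of via the CLI wrapper.
# Assumes a CUDA GPU and benchmarks/ as the current working directory.
import argparse

from base_classes import ControlNetBenchmark

args = argparse.Namespace(
    ckpt="lllyasviel/sd-controlnet-canny",  # SD v1-5 canny ControlNet
    batch_size=1,
    num_inference_steps=50,
    model_cpu_offload=False,
    run_compile=False,  # set True to also benchmark the torch.compile path
)

bench = ControlNetBenchmark(args)  # loads the fp16 pipeline on CUDA and resizes the conditioning image
bench.benchmark(args)              # times inference, records peak GPU memory, writes a CSV under BASE_PATH
```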
diff --git a/benchmarks/benchmark_sd_img.py b/benchmarks/benchmark_sd_img.py new file mode 100644 index 000000000000..491e7c9a65a9 --- /dev/null +++ b/benchmarks/benchmark_sd_img.py @@ -0,0 +1,29 @@ +import argparse +import sys + + +sys.path.append(".") +from base_classes import ImageToImageBenchmark, TurboImageToImageBenchmark # noqa: E402 + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--ckpt", + type=str, + default="runwayml/stable-diffusion-v1-5", + choices=[ + "runwayml/stable-diffusion-v1-5", + "stabilityai/stable-diffusion-2-1", + "stabilityai/stable-diffusion-xl-refiner-1.0", + "stabilityai/sdxl-turbo", + ], + ) + parser.add_argument("--batch_size", type=int, default=1) + parser.add_argument("--num_inference_steps", type=int, default=50) + parser.add_argument("--model_cpu_offload", action="store_true") + parser.add_argument("--run_compile", action="store_true") + args = parser.parse_args() + + benchmark_pipe = ImageToImageBenchmark(args) if "turbo" not in args.ckpt else TurboImageToImageBenchmark(args) + benchmark_pipe.benchmark(args) diff --git a/benchmarks/benchmark_sd_inpainting.py b/benchmarks/benchmark_sd_inpainting.py new file mode 100644 index 000000000000..8f36883e16f3 --- /dev/null +++ b/benchmarks/benchmark_sd_inpainting.py @@ -0,0 +1,28 @@ +import argparse +import sys + + +sys.path.append(".") +from base_classes import InpaintingBenchmark # noqa: E402 + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--ckpt", + type=str, + default="runwayml/stable-diffusion-v1-5", + choices=[ + "runwayml/stable-diffusion-v1-5", + "stabilityai/stable-diffusion-2-1", + "stabilityai/stable-diffusion-xl-base-1.0", + ], + ) + parser.add_argument("--batch_size", type=int, default=1) + parser.add_argument("--num_inference_steps", type=int, default=50) + parser.add_argument("--model_cpu_offload", action="store_true") + parser.add_argument("--run_compile", action="store_true") + args = parser.parse_args() + + benchmark_pipe = InpaintingBenchmark(args) + benchmark_pipe.benchmark(args) diff --git a/benchmarks/benchmark_t2i_adapter.py b/benchmarks/benchmark_t2i_adapter.py new file mode 100644 index 000000000000..44b04b470ea6 --- /dev/null +++ b/benchmarks/benchmark_t2i_adapter.py @@ -0,0 +1,28 @@ +import argparse +import sys + + +sys.path.append(".") +from base_classes import T2IAdapterBenchmark, T2IAdapterSDXLBenchmark # noqa: E402 + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--ckpt", + type=str, + default="TencentARC/t2iadapter_canny_sd14v1", + choices=["TencentARC/t2iadapter_canny_sd14v1", "TencentARC/t2i-adapter-canny-sdxl-1.0"], + ) + parser.add_argument("--batch_size", type=int, default=1) + parser.add_argument("--num_inference_steps", type=int, default=50) + parser.add_argument("--model_cpu_offload", action="store_true") + parser.add_argument("--run_compile", action="store_true") + args = parser.parse_args() + + benchmark_pipe = ( + T2IAdapterBenchmark(args) + if args.ckpt == "TencentARC/t2iadapter_canny_sd14v1" + else T2IAdapterSDXLBenchmark(args) + ) + benchmark_pipe.benchmark(args) diff --git a/benchmarks/benchmark_t2i_lcm_lora.py b/benchmarks/benchmark_t2i_lcm_lora.py new file mode 100644 index 000000000000..957e0a463e28 --- /dev/null +++ b/benchmarks/benchmark_t2i_lcm_lora.py @@ -0,0 +1,23 @@ +import argparse +import sys + + +sys.path.append(".") +from base_classes import LCMLoRATextToImageBenchmark # noqa: E402 + + +if __name__ == "__main__": + 
parser = argparse.ArgumentParser() + parser.add_argument( + "--ckpt", + type=str, + default="stabilityai/stable-diffusion-xl-base-1.0", + ) + parser.add_argument("--batch_size", type=int, default=1) + parser.add_argument("--num_inference_steps", type=int, default=4) + parser.add_argument("--model_cpu_offload", action="store_true") + parser.add_argument("--run_compile", action="store_true") + args = parser.parse_args() + + benchmark_pipe = LCMLoRATextToImageBenchmark(args) + benchmark_pipe.benchmark(args) diff --git a/benchmarks/benchmark_text_to_image.py b/benchmarks/benchmark_text_to_image.py new file mode 100644 index 000000000000..caa97b0c5e3b --- /dev/null +++ b/benchmarks/benchmark_text_to_image.py @@ -0,0 +1,40 @@ +import argparse +import sys + + +sys.path.append(".") +from base_classes import TextToImageBenchmark, TurboTextToImageBenchmark # noqa: E402 + + +ALL_T2I_CKPTS = [ + "runwayml/stable-diffusion-v1-5", + "segmind/SSD-1B", + "stabilityai/stable-diffusion-xl-base-1.0", + "kandinsky-community/kandinsky-2-2-decoder", + "warp-ai/wuerstchen", + "stabilityai/sdxl-turbo", +] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--ckpt", + type=str, + default="runwayml/stable-diffusion-v1-5", + choices=ALL_T2I_CKPTS, + ) + parser.add_argument("--batch_size", type=int, default=1) + parser.add_argument("--num_inference_steps", type=int, default=50) + parser.add_argument("--model_cpu_offload", action="store_true") + parser.add_argument("--run_compile", action="store_true") + args = parser.parse_args() + + benchmark_cls = None + if "turbo" in args.ckpt: + benchmark_cls = TurboTextToImageBenchmark + else: + benchmark_cls = TextToImageBenchmark + + benchmark_pipe = benchmark_cls(args) + benchmark_pipe.benchmark(args) diff --git a/benchmarks/push_results.py b/benchmarks/push_results.py new file mode 100644 index 000000000000..962e07c6d74c --- /dev/null +++ b/benchmarks/push_results.py @@ -0,0 +1,72 @@ +import glob +import sys + +import pandas as pd +from huggingface_hub import hf_hub_download, upload_file +from huggingface_hub.utils._errors import EntryNotFoundError + + +sys.path.append(".") +from utils import BASE_PATH, FINAL_CSV_FILE, GITHUB_SHA, REPO_ID, collate_csv # noqa: E402 + + +def has_previous_benchmark() -> str: + csv_path = None + try: + csv_path = hf_hub_download(repo_id=REPO_ID, repo_type="dataset", filename=FINAL_CSV_FILE) + except EntryNotFoundError: + csv_path = None + return csv_path + + +def filter_float(value): + if isinstance(value, str): + return float(value.split()[0]) + return value + + +def push_to_hf_dataset(): + all_csvs = sorted(glob.glob(f"{BASE_PATH}/*.csv")) + collate_csv(all_csvs, FINAL_CSV_FILE) + + # If there's an existing benchmark file, we should report the changes. 
+ csv_path = has_previous_benchmark() + if csv_path is not None: + current_results = pd.read_csv(FINAL_CSV_FILE) + previous_results = pd.read_csv(csv_path) + + numeric_columns = current_results.select_dtypes(include=["float64", "int64"]).columns + numeric_columns = [ + c for c in numeric_columns if c not in ["batch_size", "num_inference_steps", "actual_gpu_memory (gbs)"] + ] + + for column in numeric_columns: + previous_results[column] = previous_results[column].map(lambda x: filter_float(x)) + + # Calculate the percentage change + current_results[column] = current_results[column].astype(float) + previous_results[column] = previous_results[column].astype(float) + percent_change = ((current_results[column] - previous_results[column]) / previous_results[column]) * 100 + + # Format the values with '+' or '-' sign and append to original values + current_results[column] = current_results[column].map(str) + percent_change.map( + lambda x: f" ({'+' if x > 0 else ''}{x:.2f}%)" + ) + # There might be newly added rows. So, filter out the NaNs. + current_results[column] = current_results[column].map(lambda x: x.replace(" (nan%)", "")) + + # Overwrite the current result file. + current_results.to_csv(FINAL_CSV_FILE, index=False) + + commit_message = f"upload from sha: {GITHUB_SHA}" if GITHUB_SHA is not None else "upload benchmark results" + upload_file( + repo_id=REPO_ID, + path_in_repo=FINAL_CSV_FILE, + path_or_fileobj=FINAL_CSV_FILE, + repo_type="dataset", + commit_message=commit_message, + ) + + +if __name__ == "__main__": + push_to_hf_dataset() diff --git a/benchmarks/run_all.py b/benchmarks/run_all.py new file mode 100644 index 000000000000..c70fb2227383 --- /dev/null +++ b/benchmarks/run_all.py @@ -0,0 +1,97 @@ +import glob +import subprocess +import sys +from typing import List + + +sys.path.append(".") +from benchmark_text_to_image import ALL_T2I_CKPTS # noqa: E402 + + +PATTERN = "benchmark_*.py" + + +class SubprocessCallException(Exception): + pass + + +# Taken from `test_examples_utils.py` +def run_command(command: List[str], return_stdout=False): + """ + Runs `command` with `subprocess.check_output` and will potentially return the `stdout`. Will also properly capture + if an error occurred while running `command` + """ + try: + output = subprocess.check_output(command, stderr=subprocess.STDOUT) + if return_stdout: + if hasattr(output, "decode"): + output = output.decode("utf-8") + return output + except subprocess.CalledProcessError as e: + raise SubprocessCallException( + f"Command `{' '.join(command)}` failed with the following error:\n\n{e.output.decode()}" + ) from e + + +def main(): + python_files = glob.glob(PATTERN) + + for file in python_files: + print(f"****** Running file: {file} ******") + + # Run with canonical settings. + if file != "benchmark_text_to_image.py": + command = f"python {file}" + run_command(command.split()) + + command += " --run_compile" + run_command(command.split()) + + # Run variants. 
+ for file in python_files: + if file == "benchmark_text_to_image.py": + for ckpt in ALL_T2I_CKPTS: + command = f"python {file} --ckpt {ckpt}" + + if "turbo" in ckpt: + command += " --num_inference_steps 1" + + run_command(command.split()) + + command += " --run_compile" + run_command(command.split()) + + elif file == "benchmark_sd_img.py": + for ckpt in ["stabilityai/stable-diffusion-xl-refiner-1.0", "stabilityai/sdxl-turbo"]: + command = f"python {file} --ckpt {ckpt}" + + if ckpt == "stabilityai/sdxl-turbo": + command += " --num_inference_steps 2" + + run_command(command.split()) + command += " --run_compile" + run_command(command.split()) + + elif file == "benchmark_sd_inpainting.py": + sdxl_ckpt = "stabilityai/stable-diffusion-xl-base-1.0" + command = f"python {file} --ckpt {sdxl_ckpt}" + run_command(command.split()) + + command += " --run_compile" + run_command(command.split()) + + elif file in ["benchmark_controlnet.py", "benchmark_t2i_adapter.py"]: + sdxl_ckpt = ( + "diffusers/controlnet-canny-sdxl-1.0" + if "controlnet" in file + else "TencentARC/t2i-adapter-canny-sdxl-1.0" + ) + command = f"python {file} --ckpt {sdxl_ckpt}" + run_command(command.split()) + + command += " --run_compile" + run_command(command.split()) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/utils.py b/benchmarks/utils.py new file mode 100644 index 000000000000..5fce920ac6c3 --- /dev/null +++ b/benchmarks/utils.py @@ -0,0 +1,98 @@ +import argparse +import csv +import gc +import os +from dataclasses import dataclass +from typing import Dict, List, Union + +import torch +import torch.utils.benchmark as benchmark + + +GITHUB_SHA = os.getenv("GITHUB_SHA", None) +BENCHMARK_FIELDS = [ + "pipeline_cls", + "ckpt_id", + "batch_size", + "num_inference_steps", + "model_cpu_offload", + "run_compile", + "time (secs)", + "memory (gbs)", + "actual_gpu_memory (gbs)", + "github_sha", +] + +PROMPT = "ghibli style, a fantasy landscape with castles" +BASE_PATH = os.getenv("BASE_PATH", ".") +TOTAL_GPU_MEMORY = float(os.getenv("TOTAL_GPU_MEMORY", torch.cuda.get_device_properties(0).total_memory / (1024**3))) + +REPO_ID = "diffusers/benchmarks" +FINAL_CSV_FILE = "collated_results.csv" + + +@dataclass +class BenchmarkInfo: + time: float + memory: float + + +def flush(): + """Wipes off memory.""" + gc.collect() + torch.cuda.empty_cache() + torch.cuda.reset_max_memory_allocated() + torch.cuda.reset_peak_memory_stats() + + +def bytes_to_giga_bytes(bytes): + return f"{(bytes / 1024 / 1024 / 1024):.3f}" + + +def benchmark_fn(f, *args, **kwargs): + t0 = benchmark.Timer( + stmt="f(*args, **kwargs)", + globals={"args": args, "kwargs": kwargs, "f": f}, + num_threads=torch.get_num_threads(), + ) + return f"{(t0.blocked_autorange().mean):.3f}" + + +def generate_csv_dict( + pipeline_cls: str, ckpt: str, args: argparse.Namespace, benchmark_info: BenchmarkInfo +) -> Dict[str, Union[str, bool, float]]: + """Packs benchmarking data into a dictionary for latter serialization.""" + data_dict = { + "pipeline_cls": pipeline_cls, + "ckpt_id": ckpt, + "batch_size": args.batch_size, + "num_inference_steps": args.num_inference_steps, + "model_cpu_offload": args.model_cpu_offload, + "run_compile": args.run_compile, + "time (secs)": benchmark_info.time, + "memory (gbs)": benchmark_info.memory, + "actual_gpu_memory (gbs)": f"{(TOTAL_GPU_MEMORY):.3f}", + "github_sha": GITHUB_SHA, + } + return data_dict + + +def write_to_csv(file_name: str, data_dict: Dict[str, Union[str, bool, float]]): + """Serializes a dictionary into a CSV file.""" + with 
open(file_name, mode="w", newline="") as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=BENCHMARK_FIELDS) + writer.writeheader() + writer.writerow(data_dict) + + +def collate_csv(input_files: List[str], output_file: str): + """Collates multiple identically structured CSVs into a single CSV file.""" + with open(output_file, mode="w", newline="") as outfile: + writer = csv.DictWriter(outfile, fieldnames=BENCHMARK_FIELDS) + writer.writeheader() + + for file in input_files: + with open(file, mode="r") as infile: + reader = csv.DictReader(infile) + for row in reader: + writer.writerow(row) From c46711e895bcec849fdcce69a7bc6864f023b6d1 Mon Sep 17 00:00:00 2001 From: Monohydroxides <75928535+Monohydroxides@users.noreply.github.com> Date: Thu, 14 Dec 2023 23:17:20 +0800 Subject: [PATCH 02/30] [Community] Add SDE Drag pipeline (#6105) * Add community pipeline: sde_drag.py * Update README.md * Update README.md Update example code and visual example * Update sde_drag.py Update code example. --- examples/community/README.md | 40 +++ examples/community/sde_drag.py | 594 +++++++++++++++++++++++++++++++++ 2 files changed, 634 insertions(+) create mode 100644 examples/community/sde_drag.py diff --git a/examples/community/README.md b/examples/community/README.md index e121e68bc9ce..c8865adf78f7 100755 --- a/examples/community/README.md +++ b/examples/community/README.md @@ -48,6 +48,7 @@ prompt-to-prompt | change parts of a prompt and retain image structure (see [pap | Latent Consistency Pipeline | Implementation of [Latent Consistency Models: Synthesizing High-Resolution Images with Few-Step Inference](https://arxiv.org/abs/2310.04378) | [Latent Consistency Pipeline](#latent-consistency-pipeline) | - | [Simian Luo](https://github.com/luosiallen) | | Latent Consistency Img2img Pipeline | Img2img pipeline for Latent Consistency Models | [Latent Consistency Img2Img Pipeline](#latent-consistency-img2img-pipeline) | - | [Logan Zoellner](https://github.com/nagolinc) | | Latent Consistency Interpolation Pipeline | Interpolate the latent space of Latent Consistency Models with multiple prompts | [Latent Consistency Interpolation Pipeline](#latent-consistency-interpolation-pipeline) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1pK3NrLWJSiJsBynLns1K1-IDTW9zbPvl?usp=sharing) | [Aryan V S](https://github.com/a-r-r-o-w) | +| SDE Drag Pipeline | The pipeline supports drag editing of images using stochastic differential equations | [SDE Drag Pipeline](#sde-drag-pipeline) | - | [NieShen](https://github.com/NieShenRuc) [Fengqi Zhu](https://github.com/Monohydroxides) | | Regional Prompting Pipeline | Assign multiple prompts for different regions | [Regional Prompting Pipeline](#regional-prompting-pipeline) | - | [hako-mikan](https://github.com/hako-mikan) | | LDM3D-sr (LDM3D upscaler) | Upscale low resolution RGB and depth inputs to high resolution | [StableDiffusionUpscaleLDM3D Pipeline](https://github.com/estelleafl/diffusers/tree/ldm3d_upscaler_community/examples/community#stablediffusionupscaleldm3d-pipeline) | - | [Estelle Aflalo](https://github.com/estelleafl) | | AnimateDiff ControlNet Pipeline | Combines AnimateDiff with precise motion control using ControlNets | [AnimateDiff ControlNet Pipeline](#animatediff-controlnet-pipeline) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1SKboYeGjEQmQPWoFC0aLYpBlYdHXkvAu?usp=sharing) | [Aryan V 
S](https://github.com/a-r-r-o-w) and [Edoardo Botta](https://github.com/EdoardoBotta) | @@ -2986,3 +2987,42 @@ def image_grid(imgs, save_path=None): image_grid(images, save_path="./outputs/") ``` ![output_example](https://github.com/PRIS-CV/DemoFusion/blob/main/output_example.png) + +### SDE Drag pipeline + +This pipeline provides drag-and-drop image editing using stochastic differential equations. It enables image editing by inputting prompt, image, mask_image, source_points, and target_points. + +![SDE Drag Image](https://github.com/huggingface/diffusers/assets/75928535/bd54f52f-f002-4951-9934-b2a4592771a5) + +See [paper](https://arxiv.org/abs/2311.01410), [paper page](https://ml-gsai.github.io/SDE-Drag-demo/), [original repo](https://github.com/ML-GSAI/SDE-Drag) for more infomation. + +```py +import PIL +import torch +from diffusers import DDIMScheduler, DiffusionPipeline + +# Load the pipeline +model_path = "runwayml/stable-diffusion-v1-5" +scheduler = DDIMScheduler.from_pretrained(model_path, subfolder="scheduler") +pipe = DiffusionPipeline.from_pretrained(model_path, scheduler=scheduler, custom_pipeline="sde_drag") +pipe.to('cuda') + +# To save GPU memory, torch.float16 can be used, but it may compromise image quality. +# If not training LoRA, please avoid using torch.float16 +# pipe.to(torch.float16) + +# Provide prompt, image, mask image, and the starting and target points for drag editing. +prompt = "prompt of the image" +image = PIL.Image.open('/path/to/image') +mask_image = PIL.Image.open('/path/to/mask_image') +source_points = [[123, 456]] +target_points = [[234, 567]] + +# train_lora is optional, and in most cases, using train_lora can better preserve consistency with the original image. +pipe.train_lora(prompt, image) + +output = pipe(prompt, image, mask_image, source_points, target_points) +output_image = PIL.Image.fromarray(output) +output_image.save("./output.png") + +``` diff --git a/examples/community/sde_drag.py b/examples/community/sde_drag.py new file mode 100644 index 000000000000..08e865b9c350 --- /dev/null +++ b/examples/community/sde_drag.py @@ -0,0 +1,594 @@ +import math +import tempfile +from typing import List, Optional + +import numpy as np +import PIL.Image +import torch +from accelerate import Accelerator +from torchvision import transforms +from tqdm.auto import tqdm +from transformers import CLIPTextModel, CLIPTokenizer + +from diffusers import AutoencoderKL, DiffusionPipeline, DPMSolverMultistepScheduler, UNet2DConditionModel +from diffusers.loaders import AttnProcsLayers, LoraLoaderMixin +from diffusers.models.attention_processor import ( + AttnAddedKVProcessor, + AttnAddedKVProcessor2_0, + LoRAAttnAddedKVProcessor, + LoRAAttnProcessor, + LoRAAttnProcessor2_0, + SlicedAttnAddedKVProcessor, +) +from diffusers.optimization import get_scheduler + + +class SdeDragPipeline(DiffusionPipeline): + r""" + Pipeline for image drag-and-drop editing using stochastic differential equations: https://arxiv.org/abs/2311.01410. + Please refer to the [official repository](https://github.com/ML-GSAI/SDE-Drag) for more information. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder. 
Stable Diffusion uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Please use + [`DDIMScheduler`]. + """ + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: DPMSolverMultistepScheduler, + ): + super().__init__() + + self.register_modules(vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, unet=unet, scheduler=scheduler) + + @torch.no_grad() + def __call__( + self, + prompt: str, + image: PIL.Image.Image, + mask_image: PIL.Image.Image, + source_points: List[List[int]], + target_points: List[List[int]], + t0: Optional[float] = 0.6, + steps: Optional[int] = 200, + step_size: Optional[int] = 2, + image_scale: Optional[float] = 0.3, + adapt_radius: Optional[int] = 5, + min_lora_scale: Optional[float] = 0.5, + generator: Optional[torch.Generator] = None, + ): + r""" + Function invoked when calling the pipeline for image editing. + Args: + prompt (`str`, *required*): + The prompt to guide the image editing. + image (`PIL.Image.Image`, *required*): + Which will be edited, parts of the image will be masked out with `mask_image` and edited + according to `prompt`. + mask_image (`PIL.Image.Image`, *required*): + To mask `image`. White pixels in the mask will be edited, while black pixels will be preserved. + source_points (`List[List[int]]`, *required*): + Used to mark the starting positions of drag editing in the image, with each pixel represented as a + `List[int]` of length 2. + target_points (`List[List[int]]`, *required*): + Used to mark the target positions of drag editing in the image, with each pixel represented as a + `List[int]` of length 2. + t0 (`float`, *optional*, defaults to 0.6): + The time parameter. Higher t0 improves the fidelity while lowering the faithfulness of the edited images + and vice versa. + steps (`int`, *optional*, defaults to 200): + The number of sampling iterations. + step_size (`int`, *optional*, defaults to 2): + The drag diatance of each drag step. + image_scale (`float`, *optional*, defaults to 0.3): + To avoid duplicating the content, use image_scale to perturbs the source. + adapt_radius (`int`, *optional*, defaults to 5): + The size of the region for copy and paste operations during each step of the drag process. + min_lora_scale (`float`, *optional*, defaults to 0.5): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + min_lora_scale specifies the minimum LoRA scale during the image drag-editing process. + generator ('torch.Generator', *optional*, defaults to None): + To make generation deterministic(https://pytorch.org/docs/stable/generated/torch.Generator.html). 
+ Examples: + ```py + >>> import PIL + >>> import torch + >>> from diffusers import DDIMScheduler, DiffusionPipeline + + >>> # Load the pipeline + >>> model_path = "runwayml/stable-diffusion-v1-5" + >>> scheduler = DDIMScheduler.from_pretrained(model_path, subfolder="scheduler") + >>> pipe = DiffusionPipeline.from_pretrained(model_path, scheduler=scheduler, custom_pipeline="sde_drag") + >>> pipe.to('cuda') + + >>> # To save GPU memory, torch.float16 can be used, but it may compromise image quality. + >>> # If not training LoRA, please avoid using torch.float16 + >>> # pipe.to(torch.float16) + + >>> # Provide prompt, image, mask image, and the starting and target points for drag editing. + >>> prompt = "prompt of the image" + >>> image = PIL.Image.open('/path/to/image') + >>> mask_image = PIL.Image.open('/path/to/mask_image') + >>> source_points = [[123, 456]] + >>> target_points = [[234, 567]] + + >>> # train_lora is optional, and in most cases, using train_lora can better preserve consistency with the original image. + >>> pipe.train_lora(prompt, image) + + >>> output = pipe(prompt, image, mask_image, source_points, target_points) + >>> output_image = PIL.Image.fromarray(output) + >>> output_image.save("./output.png") + ``` + """ + + self.scheduler.set_timesteps(steps) + + noise_scale = (1 - image_scale**2) ** (0.5) + + text_embeddings = self._get_text_embed(prompt) + uncond_embeddings = self._get_text_embed([""]) + text_embeddings = torch.cat([uncond_embeddings, text_embeddings]) + + latent = self._get_img_latent(image) + + mask = mask_image.resize((latent.shape[3], latent.shape[2])) + mask = torch.tensor(np.array(mask)) + mask = mask.unsqueeze(0).expand_as(latent).to(self.device) + + source_points = torch.tensor(source_points).div(torch.tensor([8]), rounding_mode="trunc") + target_points = torch.tensor(target_points).div(torch.tensor([8]), rounding_mode="trunc") + + distance = target_points - source_points + distance_norm_max = torch.norm(distance.float(), dim=1, keepdim=True).max() + + if distance_norm_max <= step_size: + drag_num = 1 + else: + drag_num = distance_norm_max.div(torch.tensor([step_size]), rounding_mode="trunc") + if (distance_norm_max / drag_num - step_size).abs() > ( + distance_norm_max / (drag_num + 1) - step_size + ).abs(): + drag_num += 1 + + latents = [] + for i in tqdm(range(int(drag_num)), desc="SDE Drag"): + source_new = source_points + (i / drag_num * distance).to(torch.int) + target_new = source_points + ((i + 1) / drag_num * distance).to(torch.int) + + latent, noises, hook_latents, lora_scales, cfg_scales = self._forward( + latent, steps, t0, min_lora_scale, text_embeddings, generator + ) + latent = self._copy_and_paste( + latent, + source_new, + target_new, + adapt_radius, + latent.shape[2] - 1, + latent.shape[3] - 1, + image_scale, + noise_scale, + generator, + ) + latent = self._backward( + latent, mask, steps, t0, noises, hook_latents, lora_scales, cfg_scales, text_embeddings, generator + ) + + latents.append(latent) + + result_image = 1 / 0.18215 * latents[-1] + + with torch.no_grad(): + result_image = self.vae.decode(result_image).sample + + result_image = (result_image / 2 + 0.5).clamp(0, 1) + result_image = result_image.cpu().permute(0, 2, 3, 1).numpy()[0] + result_image = (result_image * 255).astype(np.uint8) + + return result_image + + def train_lora(self, prompt, image, lora_step=100, lora_rank=16, generator=None): + accelerator = Accelerator(gradient_accumulation_steps=1, mixed_precision="fp16") + + self.vae.requires_grad_(False) + 
self.text_encoder.requires_grad_(False) + self.unet.requires_grad_(False) + + unet_lora_attn_procs = {} + for name, attn_processor in self.unet.attn_processors.items(): + cross_attention_dim = None if name.endswith("attn1.processor") else self.unet.config.cross_attention_dim + if name.startswith("mid_block"): + hidden_size = self.unet.config.block_out_channels[-1] + elif name.startswith("up_blocks"): + block_id = int(name[len("up_blocks.")]) + hidden_size = list(reversed(self.unet.config.block_out_channels))[block_id] + elif name.startswith("down_blocks"): + block_id = int(name[len("down_blocks.")]) + hidden_size = self.unet.config.block_out_channels[block_id] + else: + raise NotImplementedError("name must start with up_blocks, mid_blocks, or down_blocks") + + if isinstance(attn_processor, (AttnAddedKVProcessor, SlicedAttnAddedKVProcessor, AttnAddedKVProcessor2_0)): + lora_attn_processor_class = LoRAAttnAddedKVProcessor + else: + lora_attn_processor_class = ( + LoRAAttnProcessor2_0 + if hasattr(torch.nn.functional, "scaled_dot_product_attention") + else LoRAAttnProcessor + ) + unet_lora_attn_procs[name] = lora_attn_processor_class( + hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, rank=lora_rank + ) + + self.unet.set_attn_processor(unet_lora_attn_procs) + unet_lora_layers = AttnProcsLayers(self.unet.attn_processors) + params_to_optimize = unet_lora_layers.parameters() + + optimizer = torch.optim.AdamW( + params_to_optimize, + lr=2e-4, + betas=(0.9, 0.999), + weight_decay=1e-2, + eps=1e-08, + ) + + lr_scheduler = get_scheduler( + "constant", + optimizer=optimizer, + num_warmup_steps=0, + num_training_steps=lora_step, + num_cycles=1, + power=1.0, + ) + + unet_lora_layers = accelerator.prepare_model(unet_lora_layers) + optimizer = accelerator.prepare_optimizer(optimizer) + lr_scheduler = accelerator.prepare_scheduler(lr_scheduler) + + with torch.no_grad(): + text_inputs = self._tokenize_prompt(prompt, tokenizer_max_length=None) + text_embedding = self._encode_prompt( + text_inputs.input_ids, text_inputs.attention_mask, text_encoder_use_attention_mask=False + ) + + image_transforms = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize([0.5], [0.5]), + ] + ) + + image = image_transforms(image).to(self.device, dtype=self.vae.dtype) + image = image.unsqueeze(dim=0) + latents_dist = self.vae.encode(image).latent_dist + + for _ in tqdm(range(lora_step), desc="Train LoRA"): + self.unet.train() + model_input = latents_dist.sample() * self.vae.config.scaling_factor + + # Sample noise that we'll add to the latents + noise = torch.randn( + model_input.size(), + dtype=model_input.dtype, + layout=model_input.layout, + device=model_input.device, + generator=generator, + ) + bsz, channels, height, width = model_input.shape + + # Sample a random timestep for each image + timesteps = torch.randint( + 0, self.scheduler.config.num_train_timesteps, (bsz,), device=model_input.device, generator=generator + ) + timesteps = timesteps.long() + + # Add noise to the model input according to the noise magnitude at each timestep + # (this is the forward diffusion process) + noisy_model_input = self.scheduler.add_noise(model_input, noise, timesteps) + + # Predict the noise residual + model_pred = self.unet(noisy_model_input, timesteps, text_embedding).sample + + # Get the target for loss depending on the prediction type + if self.scheduler.config.prediction_type == "epsilon": + target = noise + elif self.scheduler.config.prediction_type == "v_prediction": + target = 
self.scheduler.get_velocity(model_input, noise, timesteps) + else: + raise ValueError(f"Unknown prediction type {self.scheduler.config.prediction_type}") + + loss = torch.nn.functional.mse_loss(model_pred.float(), target.float(), reduction="mean") + accelerator.backward(loss) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + with tempfile.TemporaryDirectory() as save_lora_dir: + LoraLoaderMixin.save_lora_weights( + save_directory=save_lora_dir, + unet_lora_layers=unet_lora_layers, + text_encoder_lora_layers=None, + ) + + self.unet.load_attn_procs(save_lora_dir) + + def _tokenize_prompt(self, prompt, tokenizer_max_length=None): + if tokenizer_max_length is not None: + max_length = tokenizer_max_length + else: + max_length = self.tokenizer.model_max_length + + text_inputs = self.tokenizer( + prompt, + truncation=True, + padding="max_length", + max_length=max_length, + return_tensors="pt", + ) + + return text_inputs + + def _encode_prompt(self, input_ids, attention_mask, text_encoder_use_attention_mask=False): + text_input_ids = input_ids.to(self.device) + + if text_encoder_use_attention_mask: + attention_mask = attention_mask.to(self.device) + else: + attention_mask = None + + prompt_embeds = self.text_encoder( + text_input_ids, + attention_mask=attention_mask, + ) + prompt_embeds = prompt_embeds[0] + + return prompt_embeds + + @torch.no_grad() + def _get_text_embed(self, prompt): + text_input = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_embeddings = self.text_encoder(text_input.input_ids.to(self.device))[0] + return text_embeddings + + def _copy_and_paste( + self, latent, source_new, target_new, adapt_radius, max_height, max_width, image_scale, noise_scale, generator + ): + def adaption_r(source, target, adapt_radius, max_height, max_width): + r_x_lower = min(adapt_radius, source[0], target[0]) + r_x_upper = min(adapt_radius, max_width - source[0], max_width - target[0]) + r_y_lower = min(adapt_radius, source[1], target[1]) + r_y_upper = min(adapt_radius, max_height - source[1], max_height - target[1]) + return r_x_lower, r_x_upper, r_y_lower, r_y_upper + + for source_, target_ in zip(source_new, target_new): + r_x_lower, r_x_upper, r_y_lower, r_y_upper = adaption_r( + source_, target_, adapt_radius, max_height, max_width + ) + + source_feature = latent[ + :, :, source_[1] - r_y_lower : source_[1] + r_y_upper, source_[0] - r_x_lower : source_[0] + r_x_upper + ].clone() + + latent[ + :, :, source_[1] - r_y_lower : source_[1] + r_y_upper, source_[0] - r_x_lower : source_[0] + r_x_upper + ] = image_scale * source_feature + noise_scale * torch.randn( + latent.shape[0], + 4, + r_y_lower + r_y_upper, + r_x_lower + r_x_upper, + device=self.device, + generator=generator, + ) + + latent[ + :, :, target_[1] - r_y_lower : target_[1] + r_y_upper, target_[0] - r_x_lower : target_[0] + r_x_upper + ] = source_feature * 1.1 + return latent + + @torch.no_grad() + def _get_img_latent(self, image, height=None, weight=None): + data = image.convert("RGB") + if height is not None: + data = data.resize((weight, height)) + transform = transforms.ToTensor() + data = transform(data).unsqueeze(0) + data = (data * 2.0) - 1.0 + data = data.to(self.device, dtype=self.vae.dtype) + latent = self.vae.encode(data).latent_dist.sample() + latent = 0.18215 * latent + return latent + + @torch.no_grad() + def _get_eps(self, latent, timestep, guidance_scale, text_embeddings, lora_scale=None): + 
latent_model_input = torch.cat([latent] * 2) if guidance_scale > 1.0 else latent + text_embeddings = text_embeddings if guidance_scale > 1.0 else text_embeddings.chunk(2)[1] + + cross_attention_kwargs = None if lora_scale is None else {"scale": lora_scale} + + with torch.no_grad(): + noise_pred = self.unet( + latent_model_input, + timestep, + encoder_hidden_states=text_embeddings, + cross_attention_kwargs=cross_attention_kwargs, + ).sample + + if guidance_scale > 1.0: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + elif guidance_scale == 1.0: + noise_pred_text = noise_pred + noise_pred_uncond = 0.0 + else: + raise NotImplementedError(guidance_scale) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + return noise_pred + + def _forward_sde( + self, timestep, sample, guidance_scale, text_embeddings, steps, eta=1.0, lora_scale=None, generator=None + ): + num_train_timesteps = len(self.scheduler) + alphas_cumprod = self.scheduler.alphas_cumprod + initial_alpha_cumprod = torch.tensor(1.0) + + prev_timestep = timestep + num_train_timesteps // steps + + alpha_prod_t = alphas_cumprod[timestep] if timestep >= 0 else initial_alpha_cumprod + alpha_prod_t_prev = alphas_cumprod[prev_timestep] + + beta_prod_t_prev = 1 - alpha_prod_t_prev + + x_prev = (alpha_prod_t_prev / alpha_prod_t) ** (0.5) * sample + (1 - alpha_prod_t_prev / alpha_prod_t) ** ( + 0.5 + ) * torch.randn( + sample.size(), dtype=sample.dtype, layout=sample.layout, device=self.device, generator=generator + ) + eps = self._get_eps(x_prev, prev_timestep, guidance_scale, text_embeddings, lora_scale) + + sigma_t_prev = ( + eta + * (1 - alpha_prod_t) ** (0.5) + * (1 - alpha_prod_t_prev / (1 - alpha_prod_t_prev) * (1 - alpha_prod_t) / alpha_prod_t) ** (0.5) + ) + + pred_original_sample = (x_prev - beta_prod_t_prev ** (0.5) * eps) / alpha_prod_t_prev ** (0.5) + pred_sample_direction_coeff = (1 - alpha_prod_t - sigma_t_prev**2) ** (0.5) + + noise = ( + sample - alpha_prod_t ** (0.5) * pred_original_sample - pred_sample_direction_coeff * eps + ) / sigma_t_prev + + return x_prev, noise + + def _sample( + self, + timestep, + sample, + guidance_scale, + text_embeddings, + steps, + sde=False, + noise=None, + eta=1.0, + lora_scale=None, + generator=None, + ): + num_train_timesteps = len(self.scheduler) + alphas_cumprod = self.scheduler.alphas_cumprod + final_alpha_cumprod = torch.tensor(1.0) + + eps = self._get_eps(sample, timestep, guidance_scale, text_embeddings, lora_scale) + + prev_timestep = timestep - num_train_timesteps // steps + + alpha_prod_t = alphas_cumprod[timestep] + alpha_prod_t_prev = alphas_cumprod[prev_timestep] if prev_timestep >= 0 else final_alpha_cumprod + + beta_prod_t = 1 - alpha_prod_t + + sigma_t = ( + eta + * ((1 - alpha_prod_t_prev) / (1 - alpha_prod_t)) ** (0.5) + * (1 - alpha_prod_t / alpha_prod_t_prev) ** (0.5) + if sde + else 0 + ) + + pred_original_sample = (sample - beta_prod_t ** (0.5) * eps) / alpha_prod_t ** (0.5) + pred_sample_direction_coeff = (1 - alpha_prod_t_prev - sigma_t**2) ** (0.5) + + noise = ( + torch.randn( + sample.size(), dtype=sample.dtype, layout=sample.layout, device=self.device, generator=generator + ) + if noise is None + else noise + ) + latent = ( + alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction_coeff * eps + sigma_t * noise + ) + + return latent + + def _forward(self, latent, steps, t0, lora_scale_min, text_embeddings, generator): + def scale_schedule(begin, end, n, length, type="linear"): + if type == "constant": 
+ return end + elif type == "linear": + return begin + (end - begin) * n / length + elif type == "cos": + factor = (1 - math.cos(n * math.pi / length)) / 2 + return (1 - factor) * begin + factor * end + else: + raise NotImplementedError(type) + + noises = [] + latents = [] + lora_scales = [] + cfg_scales = [] + latents.append(latent) + t0 = int(t0 * steps) + t_begin = steps - t0 + + length = len(self.scheduler.timesteps[t_begin - 1 : -1]) - 1 + index = 1 + for t in self.scheduler.timesteps[t_begin:].flip(dims=[0]): + lora_scale = scale_schedule(1, lora_scale_min, index, length, type="cos") + cfg_scale = scale_schedule(1, 3.0, index, length, type="linear") + latent, noise = self._forward_sde( + t, latent, cfg_scale, text_embeddings, steps, lora_scale=lora_scale, generator=generator + ) + + noises.append(noise) + latents.append(latent) + lora_scales.append(lora_scale) + cfg_scales.append(cfg_scale) + index += 1 + return latent, noises, latents, lora_scales, cfg_scales + + def _backward( + self, latent, mask, steps, t0, noises, hook_latents, lora_scales, cfg_scales, text_embeddings, generator + ): + t0 = int(t0 * steps) + t_begin = steps - t0 + + hook_latent = hook_latents.pop() + latent = torch.where(mask > 128, latent, hook_latent) + for t in self.scheduler.timesteps[t_begin - 1 : -1]: + latent = self._sample( + t, + latent, + cfg_scales.pop(), + text_embeddings, + steps, + sde=True, + noise=noises.pop(), + lora_scale=lora_scales.pop(), + generator=generator, + ) + hook_latent = hook_latents.pop() + latent = torch.where(mask > 128, latent, hook_latent) + return latent From 8ccc76ab3760cdb1ab60c7a344e16f118bb58adc Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Thu, 14 Dec 2023 09:19:37 -0800 Subject: [PATCH 03/30] [docs] IP-Adapter API doc (#6140) add ip-adapter Co-authored-by: Sayak Paul --- docs/source/en/_toctree.yml | 2 ++ docs/source/en/api/loaders/ip_adapter.md | 25 ++++++++++++++++++++++++ 2 files changed, 27 insertions(+) create mode 100644 docs/source/en/api/loaders/ip_adapter.md diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index e72f674bfbd7..eab726d0b616 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -198,6 +198,8 @@ title: Outputs title: Main Classes - sections: + - local: api/loaders/ip_adapter + title: IP-Adapter - local: api/loaders/lora title: LoRA - local: api/loaders/single_file diff --git a/docs/source/en/api/loaders/ip_adapter.md b/docs/source/en/api/loaders/ip_adapter.md new file mode 100644 index 000000000000..f73e5167036c --- /dev/null +++ b/docs/source/en/api/loaders/ip_adapter.md @@ -0,0 +1,25 @@ + + +# IP-Adapter + +[IP-Adapter](https://hf.co/papers/2308.06721) is a lightweight adapter that enables prompting a diffusion model with an image. This method decouples the cross-attention layers of the image and text features. The image features are generated from an image encoder. Files generated from IP-Adapter are only ~100MBs. + + + +Learn how to load an IP-Adapter checkpoint and image in the [IP-Adapter](../../using-diffusers/loading_adapters#ip-adapter) loading guide. 
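+
+A minimal usage sketch for context (the `h94/IP-Adapter` repository id, weight file name, and image path below are illustrative assumptions, not taken from this page):
+
+```python
+# Minimal sketch: prompt a Stable Diffusion pipeline with a reference image via IP-Adapter.
+# The h94/IP-Adapter repo id, weight file name, and image path are assumptions for illustration.
+import torch
+from diffusers import AutoPipelineForText2Image
+from diffusers.utils import load_image
+
+pipeline = AutoPipelineForText2Image.from_pretrained(
+    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
+).to("cuda")
+pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
+
+ip_image = load_image("path/or/url/to/reference_image.png")  # the image that conditions generation
+image = pipeline(
+    prompt="a photo in the style of the reference image",
+    ip_adapter_image=ip_image,
+    num_inference_steps=50,
+).images[0]
+```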
+ + + +## IPAdapterMixin + +[[autodoc]] loaders.ip_adapter.IPAdapterMixin From 56806cdbfd4613447385e8ba78da30d901abea4d Mon Sep 17 00:00:00 2001 From: Aryan V S Date: Thu, 14 Dec 2023 22:50:30 +0530 Subject: [PATCH 04/30] Add missing subclass docs, Fix broken example in SD_safe (#6116) * fix broken example in pipeline_stable_diffusion_safe * fix typo in pipeline_stable_diffusion_pix2pix_zero * add missing docs --------- Co-authored-by: Sayak Paul --- .../pipelines/animatediff/pipeline_animatediff.py | 6 ++++++ .../pipelines/controlnet/pipeline_controlnet.py | 3 +++ .../controlnet/pipeline_controlnet_img2img.py | 4 ++++ .../controlnet/pipeline_controlnet_inpaint.py | 3 +++ .../controlnet/pipeline_controlnet_inpaint_sd_xl.py | 10 ++++------ .../controlnet/pipeline_controlnet_sd_xl.py | 6 ++++-- .../controlnet/pipeline_controlnet_sd_xl_img2img.py | 7 ++++--- .../controlnet_xs/pipeline_controlnet_xs.py | 4 +++- .../controlnet_xs/pipeline_controlnet_xs_sd_xl.py | 5 +++-- .../stable_diffusion/pipeline_cycle_diffusion.py | 5 +++++ .../pipeline_stable_diffusion_attend_and_excite.py | 3 +++ .../pipeline_stable_diffusion_inpaint.py | 1 + .../pipeline_stable_diffusion_k_diffusion.py | 5 +++++ .../pipeline_stable_diffusion_latent_upscale.py | 3 +++ .../pipeline_stable_diffusion_model_editing.py | 5 +++++ .../pipeline_stable_diffusion_pix2pix_zero.py | 2 +- .../pipeline_stable_diffusion_upscale.py | 6 ++++++ .../stable_diffusion/pipeline_stable_unclip.py | 5 +++++ .../pipeline_stable_unclip_img2img.py | 5 +++++ .../pipeline_stable_diffusion_safe.py | 3 ++- .../pipeline_stable_diffusion_xl.py | 12 ++++++------ .../pipeline_stable_diffusion_xl_img2img.py | 12 ++++++------ .../pipeline_stable_diffusion_xl_inpaint.py | 12 ++++++------ .../pipeline_stable_diffusion_xl_instruct_pix2pix.py | 10 +++++----- .../pipeline_stable_diffusion_xl_adapter.py | 6 ++++++ .../pipeline_text_to_video_synth.py | 5 +++++ .../pipeline_text_to_video_synth_img2img.py | 5 +++++ .../wuerstchen/pipeline_wuerstchen_prior.py | 4 ++++ 28 files changed, 118 insertions(+), 39 deletions(-) diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py index 32a08a0264bc..68b358f7645c 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py @@ -84,6 +84,12 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods implemented for all pipelines (downloading, saving, running on a particular device, etc.). + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + Args: vae ([`AutoencoderKL`]): Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. 
diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py index bf6ef2125446..3de6732be0f2 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py @@ -147,6 +147,9 @@ class StableDiffusionControlNetPipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters Args: diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py index 037641bd820e..2083a6391ce7 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py @@ -140,7 +140,11 @@ class StableDiffusionControlNetImg2ImgPipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + Args: vae ([`AutoencoderKL`]): Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index 71e237ce4e02..270c232b698c 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -251,6 +251,9 @@ class StableDiffusionControlNetInpaintPipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py index 0f51ad58a598..76b97b48f97c 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py @@ -148,12 +148,10 @@ class StableDiffusionXLControlNetInpaintPipeline( This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) 
- In addition the pipeline inherits the following loading methods: - - *LoRA*: [`loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] - - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`] - - as well as the following saving methods: - - *LoRA*: [`loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] + The pipeline also inherits the following loading methods: + - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files Args: vae ([`AutoencoderKL`]): diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py index 8c8399809228..0e7920708184 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py @@ -129,8 +129,10 @@ class StableDiffusionXLControlNetPipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters Args: vae ([`AutoencoderKL`]): diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py index ba18567b60f7..cbe39f788518 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py @@ -155,9 +155,10 @@ class StableDiffusionXLControlNetImg2ImgPipeline( This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) 
- In addition the pipeline inherits the following loading methods: - - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] - - *LoRA*: [`loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights Args: vae ([`AutoencoderKL`]): diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py index 1cc18e879baa..bf3ac5050506 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py @@ -98,7 +98,9 @@ class StableDiffusionControlNetXSPipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files Args: vae ([`AutoencoderKL`]): diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py index 59aee5d97d37..58f0f544a5ac 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py @@ -102,8 +102,9 @@ class StableDiffusionXLControlNetXSPipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files Args: vae ([`AutoencoderKL`]): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py index e5c2c78720d5..7b87c1065bd1 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py @@ -143,6 +143,11 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods implemented for all pipelines (downloading, saving, running on a particular device, etc.). 
+ The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + Args: vae ([`AutoencoderKL`]): Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py index 5950139fd6e1..78023f544ecf 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py @@ -177,6 +177,9 @@ class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, TextualInversion This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods implemented for all pipelines (downloading, saving, running on a particular device, etc.). + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + Args: vae ([`AutoencoderKL`]): Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index 375197cc9e4d..820c2eecb864 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -232,6 +232,7 @@ class StableDiffusionInpaintPipeline( - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files Args: vae ([`AutoencoderKL`, `AsymmetricAutoencoderKL`]): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py index 388e5a4b5ebd..5c472fad98ef 100755 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py @@ -54,6 +54,11 @@ class StableDiffusionKDiffusionPipeline(DiffusionPipeline, TextualInversionLoade This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + This is an experimental pipeline and is likely to change in the future. 
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py index cfbbb7aaab72..aa20ddeb35c3 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py @@ -67,6 +67,9 @@ class StableDiffusionLatentUpscalePipeline(DiffusionPipeline, FromSingleFileMixi This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods implemented for all pipelines (downloading, saving, running on a particular device, etc.). + The pipeline also inherits the following loading methods: + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + Args: vae ([`AutoencoderKL`]): Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py index c6364891e445..5d1c2983d448 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py @@ -43,6 +43,11 @@ class StableDiffusionModelEditingPipeline(DiffusionPipeline, TextualInversionLoa This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods implemented for all pipelines (downloading, saving, running on a particular device, etc.). + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + Args: vae ([`AutoencoderKL`]): Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py index df9849ead723..9b3e43480fb8 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py @@ -282,7 +282,7 @@ def __call__( class StableDiffusionPix2PixZeroPipeline(DiffusionPipeline): r""" - Pipeline for pixel-levl image editing using Pix2Pix Zero. Based on Stable Diffusion. + Pipeline for pixel-level image editing using Pix2Pix Zero. Based on Stable Diffusion. This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py index ceb316331b38..c6db211cb733 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py @@ -76,6 +76,12 @@ class StableDiffusionUpscalePipeline( This model inherits from [`DiffusionPipeline`]. 
Check the superclass documentation for the generic methods implemented for all pipelines (downloading, saving, running on a particular device, etc.). + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + Args: vae ([`AutoencoderKL`]): Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py index eb4542888c1f..6668238c0855 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py @@ -65,6 +65,11 @@ class StableUnCLIPPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods implemented for all pipelines (downloading, saving, running on a particular device, etc.). + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + Args: prior_tokenizer ([`CLIPTokenizer`]): A [`CLIPTokenizer`]. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py index 73638fdd15da..543ef01b09f1 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py @@ -76,6 +76,11 @@ class StableUnCLIPImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods implemented for all pipelines (downloading, saving, running on a particular device, etc.). + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + Args: feature_extractor ([`CLIPImageProcessor`]): Feature extractor for image pre-processing before being encoded. 
diff --git a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py index fdc7844a7e08..7d5bc28cb88c 100644 --- a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +++ b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py @@ -595,10 +595,11 @@ def __call__( ```py import torch from diffusers import StableDiffusionPipelineSafe + from diffusers.pipelines.stable_diffusion_safe import SafetyConfig pipeline = StableDiffusionPipelineSafe.from_pretrained( "AIML-TUDA/stable-diffusion-safe", torch_dtype=torch.float16 - ) + ).to("cuda") prompt = "the four horsewomen of the apocalypse, painting by tom of finland, gaston bussiere, craig mullins, j. c. leyendecker" image = pipeline(prompt=prompt, **SafetyConfig.MEDIUM).images[0] ``` diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index c8c6247960af..a6033b698a41 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -159,12 +159,12 @@ class StableDiffusionXLPipeline( This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - In addition the pipeline inherits the following loading methods: - - *LoRA*: [`loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] - - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`] - - as well as the following saving methods: - - *LoRA*: [`loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters Args: vae ([`AutoencoderKL`]): diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index 729924ec2e20..23d8f97ffb52 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -176,12 +176,12 @@ class StableDiffusionXLImg2ImgPipeline( This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) 
- In addition the pipeline inherits the following loading methods: - - *LoRA*: [`loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] - - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`] - - as well as the following saving methods: - - *LoRA*: [`loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters Args: vae ([`AutoencoderKL`]): diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py index 7195b5f2521a..b714bfa4bd12 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py @@ -321,12 +321,12 @@ class StableDiffusionXLInpaintPipeline( This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - In addition the pipeline inherits the following loading methods: - - *LoRA*: [`loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] - - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`] - - as well as the following saving methods: - - *LoRA*: [`loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters Args: vae ([`AutoencoderKL`]): diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py index 644948ddc0d3..b06363cffd69 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py @@ -126,11 +126,11 @@ class StableDiffusionXLInstructPix2PixPipeline( This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) 
- In addition the pipeline inherits the following loading methods: - - *LoRA*: [`loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] - - as well as the following saving methods: - - *LoRA*: [`loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights Args: vae ([`AutoencoderKL`]): diff --git a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py index b07c98fef679..36fbf48d6493 100644 --- a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py @@ -178,6 +178,12 @@ class StableDiffusionXLAdapterPipeline( This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights + Args: adapter ([`T2IAdapter`] or [`MultiAdapter`] or `List[T2IAdapter]`): Provides additional conditioning to the unet during the denoising process. If you set multiple Adapter as a diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py index 1f6650f58d2e..ab5286a5e5b4 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py @@ -83,6 +83,11 @@ class TextToVideoSDPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lora This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods implemented for all pipelines (downloading, saving, running on a particular device, etc.). + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + Args: vae ([`AutoencoderKL`]): Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. 
diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py index 6779a7b820c2..b19ccee660e2 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py @@ -159,6 +159,11 @@ class VideoToVideoSDPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods implemented for all pipelines (downloading, saving, running on a particular device, etc.). + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + Args: vae ([`AutoencoderKL`]): Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations. diff --git a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py index 8047f159677a..1eff7c6ce8a3 100644 --- a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +++ b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py @@ -69,6 +69,10 @@ class WuerstchenPriorPipeline(DiffusionPipeline, LoraLoaderMixin): This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + The pipeline also inherits the following loading methods: + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + Args: prior ([`Prior`]): The canonical unCLIP prior to approximate the image embedding from the text embedding. 
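Taken together, the docstring additions in this patch all reference the same small set of loader mixins. As a rough illustration of what those inherited methods look like at call time — the checkpoint path, LoRA directory, embedding repo, and IP-Adapter weights below are placeholders or commonly used public examples, not values taken from this patch:

```py
import torch
from diffusers import StableDiffusionPipeline

# FromSingleFileMixin: build a pipeline from a single .ckpt/.safetensors file (placeholder path)
pipe = StableDiffusionPipeline.from_single_file("path/to/checkpoint.safetensors", torch_dtype=torch.float16)

# LoraLoaderMixin: attach LoRA weights saved by a training script (placeholder directory)
pipe.load_lora_weights("path/to/lora_output_dir", weight_name="pytorch_lora_weights.safetensors")

# TextualInversionLoaderMixin: pull in a learned token embedding (public example concept)
pipe.load_textual_inversion("sd-concepts-library/cat-toy")

# IPAdapterMixin: enable image prompting (assumed public h94/IP-Adapter SD 1.5 weights)
pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
```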
From 29dfe22a8e6f1ea1e1f6cd4fbb8381f08064091e Mon Sep 17 00:00:00 2001 From: Linoy Tsaban <57615435+linoytsaban@users.noreply.github.com> Date: Thu, 14 Dec 2023 11:45:33 -0600 Subject: [PATCH 05/30] [advanced dreambooth lora sdxl training script] load pipeline for inference only if validation prompt is used (#6171) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * load pipeline for inference only if validation prompt is used * move things outside * load pipeline for inference only if validation prompt is used * fix readme when validation prompt is used --------- Co-authored-by: linoytsaban Co-authored-by: apolinário --- .../train_dreambooth_lora_sdxl_advanced.py | 67 ++++++++++--------- 1 file changed, 34 insertions(+), 33 deletions(-) diff --git a/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py b/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py index a46a1afcc145..ad37363b7d30 100644 --- a/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py +++ b/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py @@ -112,7 +112,7 @@ def save_model_card( repo_folder=None, vae_path=None, ): - img_str = "widget:\n" if images else "" + img_str = "widget:\n" for i, image in enumerate(images): image.save(os.path.join(repo_folder, f"image_{i}.png")) img_str += f""" @@ -121,6 +121,10 @@ def save_model_card( url: "image_{i}.png" """ + if not images: + img_str += f""" + - text: '{instance_prompt}' + """ trigger_str = f"You should use {instance_prompt} to trigger the image generation." diffusers_imports_pivotal = "" @@ -157,8 +161,6 @@ def save_model_card( base_model: {base_model} instance_prompt: {instance_prompt} license: openrail++ -widget: - - text: '{validation_prompt if validation_prompt else instance_prompt}' --- """ @@ -2010,43 +2012,42 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers): text_encoder_lora_layers=text_encoder_lora_layers, text_encoder_2_lora_layers=text_encoder_2_lora_layers, ) + images = [] + if args.validation_prompt and args.num_validation_images > 0: + # Final inference + # Load previous pipeline + vae = AutoencoderKL.from_pretrained( + vae_path, + subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None, + revision=args.revision, + variant=args.variant, + torch_dtype=weight_dtype, + ) + pipeline = StableDiffusionXLPipeline.from_pretrained( + args.pretrained_model_name_or_path, + vae=vae, + revision=args.revision, + variant=args.variant, + torch_dtype=weight_dtype, + ) - # Final inference - # Load previous pipeline - vae = AutoencoderKL.from_pretrained( - vae_path, - subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None, - revision=args.revision, - variant=args.variant, - torch_dtype=weight_dtype, - ) - pipeline = StableDiffusionXLPipeline.from_pretrained( - args.pretrained_model_name_or_path, - vae=vae, - revision=args.revision, - variant=args.variant, - torch_dtype=weight_dtype, - ) - - # We train on the simplified learning objective. If we were previously predicting a variance, we need the scheduler to ignore it - scheduler_args = {} + # We train on the simplified learning objective. 
If we were previously predicting a variance, we need the scheduler to ignore it + scheduler_args = {} - if "variance_type" in pipeline.scheduler.config: - variance_type = pipeline.scheduler.config.variance_type + if "variance_type" in pipeline.scheduler.config: + variance_type = pipeline.scheduler.config.variance_type - if variance_type in ["learned", "learned_range"]: - variance_type = "fixed_small" + if variance_type in ["learned", "learned_range"]: + variance_type = "fixed_small" - scheduler_args["variance_type"] = variance_type + scheduler_args["variance_type"] = variance_type - pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, **scheduler_args) + pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, **scheduler_args) - # load attention processors - pipeline.load_lora_weights(args.output_dir) + # load attention processors + pipeline.load_lora_weights(args.output_dir) - # run inference - images = [] - if args.validation_prompt and args.num_validation_images > 0: + # run inference pipeline = pipeline.to(accelerator.device) generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None images = [ From 1ccbfbb663399d3aa363af513e5a8352f3afdb35 Mon Sep 17 00:00:00 2001 From: Kuba Date: Fri, 15 Dec 2023 01:55:43 +0100 Subject: [PATCH 06/30] [docs] Add missing `\` in lora.md (#6174) --- docs/source/en/training/lora.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/training/lora.md b/docs/source/en/training/lora.md index 9ad088917dbc..b8ca21c4b3dc 100644 --- a/docs/source/en/training/lora.md +++ b/docs/source/en/training/lora.md @@ -179,7 +179,7 @@ accelerate launch --mixed_precision="fp16" train_text_to_image_lora.py \ --pretrained_model_name_or_path=$MODEL_NAME \ --dataset_name=$DATASET_NAME \ --dataloader_num_workers=8 \ - --resolution=512 + --resolution=512 \ --center_crop \ --random_flip \ --train_batch_size=1 \ @@ -214,4 +214,4 @@ image = pipeline("A pokemon with blue eyes").images[0] Congratulations on training a new model with LoRA! To learn more about how to use your new model, the following guides may be helpful: - Learn how to [load different LoRA formats](../using-diffusers/loading_adapters#LoRA) trained using community trainers like Kohya and TheLastBen. -- Learn how to use and [combine multiple LoRA's](../tutorials/using_peft_for_inference) with PEFT for inference. \ No newline at end of file +- Learn how to use and [combine multiple LoRA's](../tutorials/using_peft_for_inference) with PEFT for inference. 
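For context, the inference flow that this training guide's closing section points at looks roughly like the following; the base model id and the LoRA output location are illustrative assumptions rather than values taken from the diff above:

```py
import torch
from diffusers import AutoPipelineForText2Image

pipeline = AutoPipelineForText2Image.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# load the LoRA produced by train_text_to_image_lora.py (placeholder output directory)
pipeline.load_lora_weights("path/to/output_dir", weight_name="pytorch_lora_weights.safetensors")

image = pipeline("A pokemon with blue eyes").images[0]
```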
From 4836cfad9836e6742a1d09462f85313534388a48 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 15 Dec 2023 03:13:18 +0100 Subject: [PATCH 07/30] [Sigmas] Keep sigmas on CPU (#6173) * correct * Apply suggestions from code review * make style --- src/diffusers/schedulers/scheduling_consistency_models.py | 2 ++ src/diffusers/schedulers/scheduling_deis_multistep.py | 2 ++ src/diffusers/schedulers/scheduling_dpmsolver_multistep.py | 2 ++ .../schedulers/scheduling_dpmsolver_multistep_inverse.py | 2 ++ src/diffusers/schedulers/scheduling_dpmsolver_sde.py | 2 ++ src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py | 2 ++ src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py | 2 ++ src/diffusers/schedulers/scheduling_euler_discrete.py | 2 ++ src/diffusers/schedulers/scheduling_heun_discrete.py | 2 ++ .../schedulers/scheduling_k_dpm_2_ancestral_discrete.py | 2 ++ src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py | 2 ++ src/diffusers/schedulers/scheduling_lms_discrete.py | 2 ++ src/diffusers/schedulers/scheduling_unipc_multistep.py | 2 ++ 13 files changed, 26 insertions(+) diff --git a/src/diffusers/schedulers/scheduling_consistency_models.py b/src/diffusers/schedulers/scheduling_consistency_models.py index 23cd3ec134b7..b9a21f9bbd37 100644 --- a/src/diffusers/schedulers/scheduling_consistency_models.py +++ b/src/diffusers/schedulers/scheduling_consistency_models.py @@ -98,6 +98,7 @@ def __init__( self.custom_timesteps = False self.is_scale_input_called = False self._step_index = None + self.sigmas.to("cpu") # to avoid too much CPU/GPU communication def index_for_timestep(self, timestep, schedule_timesteps=None): if schedule_timesteps is None: @@ -230,6 +231,7 @@ def set_timesteps( self.timesteps = torch.from_numpy(timesteps).to(device=device) self._step_index = None + self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Modified _convert_to_karras implementation that takes in ramp as argument def _convert_to_karras(self, ramp): diff --git a/src/diffusers/schedulers/scheduling_deis_multistep.py b/src/diffusers/schedulers/scheduling_deis_multistep.py index bd44d2444154..572078a9d604 100644 --- a/src/diffusers/schedulers/scheduling_deis_multistep.py +++ b/src/diffusers/schedulers/scheduling_deis_multistep.py @@ -187,6 +187,7 @@ def __init__( self.model_outputs = [None] * solver_order self.lower_order_nums = 0 self._step_index = None + self.sigmas.to("cpu") # to avoid too much CPU/GPU communication @property def step_index(self): @@ -254,6 +255,7 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic # add an index counter for schedulers that allow duplicated timesteps self._step_index = None + self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py index 086505c5052b..49c07a504985 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py @@ -214,6 +214,7 @@ def __init__( self.model_outputs = [None] * solver_order self.lower_order_nums = 0 self._step_index = None + self.sigmas.to("cpu") # to avoid too much CPU/GPU communication @property def step_index(self): @@ -290,6 +291,7 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, 
torc # add an index counter for schedulers that allow duplicated timesteps self._step_index = None + self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py index cfb53c943cea..5d8f3fdf49cd 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py @@ -209,6 +209,7 @@ def __init__( self.model_outputs = [None] * solver_order self.lower_order_nums = 0 self._step_index = None + self.sigmas.to("cpu") # to avoid too much CPU/GPU communication self.use_karras_sigmas = use_karras_sigmas @property @@ -289,6 +290,7 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc # add an index counter for schedulers that allow duplicated timesteps self._step_index = None + self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py index 12345a26bcf2..a999a8adbfa7 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py @@ -198,6 +198,7 @@ def __init__( self.noise_sampler = None self.noise_sampler_seed = noise_sampler_seed self._step_index = None + self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Copied from diffusers.schedulers.scheduling_heun_discrete.HeunDiscreteScheduler.index_for_timestep def index_for_timestep(self, timestep, schedule_timesteps=None): @@ -347,6 +348,7 @@ def set_timesteps( self.mid_point_sigma = None self._step_index = None + self.sigmas.to("cpu") # to avoid too much CPU/GPU communication self.noise_sampler = None # for exp beta schedules, such as the one for `pipeline_shap_e.py` diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py index 7e8149ab55c4..dea033822e14 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py @@ -197,6 +197,7 @@ def __init__( self.sample = None self.order_list = self.get_order_list(num_train_timesteps) self._step_index = None + self.sigmas.to("cpu") # to avoid too much CPU/GPU communication def get_order_list(self, num_inference_steps: int) -> List[int]: """ @@ -288,6 +289,7 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic # add an index counter for schedulers that allow duplicated timesteps self._step_index = None + self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: diff --git a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py index 7c0dd803d91b..e476c329455e 100644 --- a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py @@ -166,6 
+166,7 @@ def __init__( self.is_scale_input_called = False self._step_index = None + self.sigmas.to("cpu") # to avoid too much CPU/GPU communication @property def init_noise_sigma(self): @@ -249,6 +250,7 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.timesteps = torch.from_numpy(timesteps).to(device=device) self._step_index = None + self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index def _init_step_index(self, timestep): diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py index 802ba0f099f9..c72f7ff336aa 100644 --- a/src/diffusers/schedulers/scheduling_euler_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_discrete.py @@ -237,6 +237,7 @@ def __init__( self.use_karras_sigmas = use_karras_sigmas self._step_index = None + self.sigmas.to("cpu") # to avoid too much CPU/GPU communication @property def init_noise_sigma(self): @@ -341,6 +342,7 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)]) self._step_index = None + self.sigmas.to("cpu") # to avoid too much CPU/GPU communication def _sigma_to_t(self, sigma, log_sigmas): # get log sigma diff --git a/src/diffusers/schedulers/scheduling_heun_discrete.py b/src/diffusers/schedulers/scheduling_heun_discrete.py index 460299cf2ec1..d06459e0a264 100644 --- a/src/diffusers/schedulers/scheduling_heun_discrete.py +++ b/src/diffusers/schedulers/scheduling_heun_discrete.py @@ -148,6 +148,7 @@ def __init__( self.use_karras_sigmas = use_karras_sigmas self._step_index = None + self.sigmas.to("cpu") # to avoid too much CPU/GPU communication def index_for_timestep(self, timestep, schedule_timesteps=None): if schedule_timesteps is None: @@ -269,6 +270,7 @@ def set_timesteps( self.dt = None self._step_index = None + self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # (YiYi Notes: keep this for now since we are keeping add_noise function which use index_for_timestep) # for exp beta schedules, such as the one for `pipeline_shap_e.py` diff --git a/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py index aae5a15abca2..dbf0984ed503 100644 --- a/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +++ b/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py @@ -140,6 +140,7 @@ def __init__( # set all values self.set_timesteps(num_train_timesteps, None, num_train_timesteps) self._step_index = None + self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Copied from diffusers.schedulers.scheduling_heun_discrete.HeunDiscreteScheduler.index_for_timestep def index_for_timestep(self, timestep, schedule_timesteps=None): @@ -295,6 +296,7 @@ def set_timesteps( self._index_counter = defaultdict(int) self._step_index = None + self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t def _sigma_to_t(self, sigma, log_sigmas): diff --git a/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py b/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py index 3248520aa9a5..e1e5124d70e5 100644 --- a/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py +++ b/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py @@ 
-140,6 +140,7 @@ def __init__( self.set_timesteps(num_train_timesteps, None, num_train_timesteps) self._step_index = None + self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Copied from diffusers.schedulers.scheduling_heun_discrete.HeunDiscreteScheduler.index_for_timestep def index_for_timestep(self, timestep, schedule_timesteps=None): @@ -284,6 +285,7 @@ def set_timesteps( self._index_counter = defaultdict(int) self._step_index = None + self.sigmas.to("cpu") # to avoid too much CPU/GPU communication @property def state_in_first_order(self): diff --git a/src/diffusers/schedulers/scheduling_lms_discrete.py b/src/diffusers/schedulers/scheduling_lms_discrete.py index 90e81c9b3c2c..a78fa0e42639 100644 --- a/src/diffusers/schedulers/scheduling_lms_discrete.py +++ b/src/diffusers/schedulers/scheduling_lms_discrete.py @@ -168,6 +168,7 @@ def __init__( self.is_scale_input_called = False self._step_index = None + self.sigmas.to("cpu") # to avoid too much CPU/GPU communication @property def init_noise_sigma(self): @@ -279,6 +280,7 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.sigmas = torch.from_numpy(sigmas).to(device=device) self.timesteps = torch.from_numpy(timesteps).to(device=device) self._step_index = None + self.sigmas.to("cpu") # to avoid too much CPU/GPU communication self.derivatives = [] diff --git a/src/diffusers/schedulers/scheduling_unipc_multistep.py b/src/diffusers/schedulers/scheduling_unipc_multistep.py index eaa6273e2768..c147e0142a32 100644 --- a/src/diffusers/schedulers/scheduling_unipc_multistep.py +++ b/src/diffusers/schedulers/scheduling_unipc_multistep.py @@ -198,6 +198,7 @@ def __init__( self.solver_p = solver_p self.last_sample = None self._step_index = None + self.sigmas.to("cpu") # to avoid too much CPU/GPU communication @property def step_index(self): @@ -268,6 +269,7 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic # add an index counter for schedulers that allow duplicated timesteps self._step_index = None + self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: From f5dfe2a8b0adc278e173d92b9080c792b413a832 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Fri, 15 Dec 2023 08:39:41 +0530 Subject: [PATCH 08/30] LoRA test fixes (#6163) * update * update * update * update --------- Co-authored-by: Sayak Paul --- tests/lora/test_lora_layers_old_backend.py | 29 ++++++++++---- tests/lora/test_lora_layers_peft.py | 46 +++++++++++----------- 2 files changed, 45 insertions(+), 30 deletions(-) diff --git a/tests/lora/test_lora_layers_old_backend.py b/tests/lora/test_lora_layers_old_backend.py index 19505a1d906d..3d3b858fa0fd 100644 --- a/tests/lora/test_lora_layers_old_backend.py +++ b/tests/lora/test_lora_layers_old_backend.py @@ -343,6 +343,21 @@ def test_stable_diffusion_attn_processors(self): image = sd_pipe(**inputs).images assert image.shape == (1, 64, 64, 3) + @unittest.skipIf(not torch.cuda.is_available() or not is_xformers_available(), reason="xformers requires cuda") + def test_stable_diffusion_set_xformers_attn_processors(self): + # disable_full_determinism() + device = "cuda" # ensure determinism for the device-dependent torch.Generator + components, _ = self.get_dummy_components() + sd_pipe = StableDiffusionPipeline(**components) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + 
+ _, _, inputs = self.get_dummy_inputs() + + # run normal sd pipe + image = sd_pipe(**inputs).images + assert image.shape == (1, 64, 64, 3) + # run lora xformers attention attn_processors, _ = create_unet_lora_layers(sd_pipe.unet) attn_processors = { @@ -607,7 +622,7 @@ def test_unload_lora_sd(self): orig_image_slice, orig_image_slice_two, atol=1e-3 ), "Unloading LoRA parameters should lead to results similar to what was obtained with the pipeline without any LoRA parameters." - @unittest.skipIf(torch_device != "cuda", "This test is supposed to run on GPU") + @unittest.skipIf(torch_device != "cuda" or not is_xformers_available(), "This test is supposed to run on GPU") def test_lora_unet_attn_processors_with_xformers(self): with tempfile.TemporaryDirectory() as tmpdirname: self.create_lora_weight_file(tmpdirname) @@ -644,7 +659,7 @@ def test_lora_unet_attn_processors_with_xformers(self): if isinstance(module, Attention): self.assertIsInstance(module.processor, XFormersAttnProcessor) - @unittest.skipIf(torch_device != "cuda", "This test is supposed to run on GPU") + @unittest.skipIf(torch_device != "cuda" or not is_xformers_available(), "This test is supposed to run on GPU") def test_lora_save_load_with_xformers(self): pipeline_components, lora_components = self.get_dummy_components() sd_pipe = StableDiffusionPipeline(**pipeline_components) @@ -2270,8 +2285,8 @@ def test_sdxl_1_0_lora_fusion_efficiency(self): lora_model_id = "hf-internal-testing/sdxl-1.0-lora" lora_filename = "sd_xl_offset_example-lora_1.0.safetensors" - pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0") - pipe.load_lora_weights(lora_model_id, weight_name=lora_filename) + pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16) + pipe.load_lora_weights(lora_model_id, weight_name=lora_filename, torch_dtype=torch.float16) pipe.enable_model_cpu_offload() start_time = time.time() @@ -2284,13 +2299,13 @@ def test_sdxl_1_0_lora_fusion_efficiency(self): del pipe - pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0") - pipe.load_lora_weights(lora_model_id, weight_name=lora_filename) + pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16) + pipe.load_lora_weights(lora_model_id, weight_name=lora_filename, torch_dtype=torch.float16) pipe.fuse_lora() pipe.enable_model_cpu_offload() - start_time = time.time() generator = torch.Generator().manual_seed(0) + start_time = time.time() for _ in range(3): pipe( "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2 diff --git a/tests/lora/test_lora_layers_peft.py b/tests/lora/test_lora_layers_peft.py index 48ae5d197273..6d3ac8b4592a 100644 --- a/tests/lora/test_lora_layers_peft.py +++ b/tests/lora/test_lora_layers_peft.py @@ -46,6 +46,7 @@ floats_tensor, load_image, nightly, + numpy_cosine_similarity_distance, require_peft_backend, require_torch_gpu, slow, @@ -1713,7 +1714,7 @@ def test_sdxl_0_9_lora_three(self): release_memory(pipe) def test_sdxl_1_0_lora(self): - generator = torch.Generator().manual_seed(0) + generator = torch.Generator("cpu").manual_seed(0) pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0") pipe.enable_model_cpu_offload() @@ -1736,7 +1737,7 @@ def test_sdxl_lcm_lora(self): pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) pipe.enable_model_cpu_offload() - generator = torch.Generator().manual_seed(0) + 
generator = torch.Generator("cpu").manual_seed(0) lora_model_id = "latent-consistency/lcm-lora-sdxl" @@ -1753,7 +1754,8 @@ def test_sdxl_lcm_lora(self): image_np = pipe.image_processor.pil_to_numpy(image) expected_image_np = pipe.image_processor.pil_to_numpy(expected_image) - self.assertTrue(np.allclose(image_np, expected_image_np, atol=1e-2)) + max_diff = numpy_cosine_similarity_distance(image_np.flatten(), expected_image_np.flatten()) + assert max_diff < 1e-4 pipe.unload_lora_weights() @@ -1764,7 +1766,7 @@ def test_sdv1_5_lcm_lora(self): pipe.to("cuda") pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) - generator = torch.Generator().manual_seed(0) + generator = torch.Generator("cpu").manual_seed(0) lora_model_id = "latent-consistency/lcm-lora-sdv1-5" pipe.load_lora_weights(lora_model_id) @@ -1780,7 +1782,8 @@ def test_sdv1_5_lcm_lora(self): image_np = pipe.image_processor.pil_to_numpy(image) expected_image_np = pipe.image_processor.pil_to_numpy(expected_image) - self.assertTrue(np.allclose(image_np, expected_image_np, atol=1e-2)) + max_diff = numpy_cosine_similarity_distance(image_np.flatten(), expected_image_np.flatten()) + assert max_diff < 1e-4 pipe.unload_lora_weights() @@ -1795,7 +1798,7 @@ def test_sdv1_5_lcm_lora_img2img(self): "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/img2img/fantasy_landscape.png" ) - generator = torch.Generator().manual_seed(0) + generator = torch.Generator("cpu").manual_seed(0) lora_model_id = "latent-consistency/lcm-lora-sdv1-5" pipe.load_lora_weights(lora_model_id) @@ -1816,7 +1819,8 @@ def test_sdv1_5_lcm_lora_img2img(self): image_np = pipe.image_processor.pil_to_numpy(image) expected_image_np = pipe.image_processor.pil_to_numpy(expected_image) - self.assertTrue(np.allclose(image_np, expected_image_np, atol=1e-2)) + max_diff = numpy_cosine_similarity_distance(image_np.flatten(), expected_image_np.flatten()) + assert max_diff < 1e-4 pipe.unload_lora_weights() @@ -1849,7 +1853,7 @@ def test_sdxl_1_0_lora_fusion(self): release_memory(pipe) def test_sdxl_1_0_lora_unfusion(self): - generator = torch.Generator().manual_seed(0) + generator = torch.Generator("cpu").manual_seed(0) pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0") lora_model_id = "hf-internal-testing/sdxl-1.0-lora" @@ -1860,16 +1864,16 @@ def test_sdxl_1_0_lora_unfusion(self): pipe.enable_model_cpu_offload() images = pipe( - "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2 + "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=3 ).images - images_with_fusion = images[0, -3:, -3:, -1].flatten() + images_with_fusion = images.flatten() pipe.unfuse_lora() - generator = torch.Generator().manual_seed(0) + generator = torch.Generator("cpu").manual_seed(0) images = pipe( - "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2 + "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=3 ).images - images_without_fusion = images[0, -3:, -3:, -1].flatten() + images_without_fusion = images.flatten() self.assertTrue(np.allclose(images_with_fusion, images_without_fusion, atol=1e-3)) release_memory(pipe) @@ -1913,10 +1917,8 @@ def test_sdxl_1_0_lora_fusion_efficiency(self): lora_model_id = "hf-internal-testing/sdxl-1.0-lora" lora_filename = "sd_xl_offset_example-lora_1.0.safetensors" - pipe = DiffusionPipeline.from_pretrained( - 
"stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.bfloat16 - ) - pipe.load_lora_weights(lora_model_id, weight_name=lora_filename, torch_dtype=torch.bfloat16) + pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16) + pipe.load_lora_weights(lora_model_id, weight_name=lora_filename, torch_dtype=torch.float16) pipe.enable_model_cpu_offload() start_time = time.time() @@ -1929,19 +1931,17 @@ def test_sdxl_1_0_lora_fusion_efficiency(self): del pipe - pipe = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.bfloat16 - ) - pipe.load_lora_weights(lora_model_id, weight_name=lora_filename, torch_dtype=torch.bfloat16) + pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16) + pipe.load_lora_weights(lora_model_id, weight_name=lora_filename, torch_dtype=torch.float16) pipe.fuse_lora() + # We need to unload the lora weights since in the previous API `fuse_lora` led to lora weights being # silently deleted - otherwise this will CPU OOM pipe.unload_lora_weights() - pipe.enable_model_cpu_offload() - start_time = time.time() generator = torch.Generator().manual_seed(0) + start_time = time.time() for _ in range(3): pipe( "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2 From 93ea26f272f69a0e27afaebc96b68a2221a7eda0 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Fri, 15 Dec 2023 08:39:59 +0530 Subject: [PATCH 09/30] Add PEFT to training deps (#6148) add peft to training deps Co-authored-by: Sayak Paul --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ddb2afd64c51..c0bfbd6efae5 100644 --- a/setup.py +++ b/setup.py @@ -204,7 +204,7 @@ def run(self): extras = {} extras["quality"] = deps_list("urllib3", "isort", "ruff", "hf-doc-builder") extras["docs"] = deps_list("hf-doc-builder") -extras["training"] = deps_list("accelerate", "datasets", "protobuf", "tensorboard", "Jinja2") +extras["training"] = deps_list("accelerate", "datasets", "protobuf", "tensorboard", "Jinja2", "peft") extras["test"] = deps_list( "compel", "GitPython", From 49db233b35a118b800aabd6edf1bef967b316034 Mon Sep 17 00:00:00 2001 From: dg845 <58458699+dg845@users.noreply.github.com> Date: Fri, 15 Dec 2023 04:48:16 -0800 Subject: [PATCH 10/30] Clean Up Comments in LCM(-LoRA) Distillation Scripts. (#6145) * Clean up comments in LCM(-LoRA) distillation scripts. * Calculate predicted source noise noise_pred correctly for all prediction_types. 
* make style * apply suggestions from review --------- Co-authored-by: Sayak Paul --- .../train_lcm_distill_lora_sd_wds.py | 127 +++++++++++----- .../train_lcm_distill_lora_sdxl_wds.py | 114 ++++++++++----- .../train_lcm_distill_sd_wds.py | 136 ++++++++++++------ .../train_lcm_distill_sdxl_wds.py | 120 +++++++++++----- 4 files changed, 350 insertions(+), 147 deletions(-) diff --git a/examples/consistency_distillation/train_lcm_distill_lora_sd_wds.py b/examples/consistency_distillation/train_lcm_distill_lora_sd_wds.py index c96733f0425e..05689b71fa04 100644 --- a/examples/consistency_distillation/train_lcm_distill_lora_sd_wds.py +++ b/examples/consistency_distillation/train_lcm_distill_lora_sd_wds.py @@ -156,7 +156,7 @@ def __call__(self, x): return False -class Text2ImageDataset: +class SDText2ImageDataset: def __init__( self, train_shards_path_or_url: Union[str, List[str]], @@ -359,19 +359,43 @@ def scalings_for_boundary_conditions(timestep, sigma_data=0.5, timestep_scaling= # Compare LCMScheduler.step, Step 4 -def predicted_origin(model_output, timesteps, sample, prediction_type, alphas, sigmas): +def get_predicted_original_sample(model_output, timesteps, sample, prediction_type, alphas, sigmas): + alphas = extract_into_tensor(alphas, timesteps, sample.shape) + sigmas = extract_into_tensor(sigmas, timesteps, sample.shape) if prediction_type == "epsilon": - sigmas = extract_into_tensor(sigmas, timesteps, sample.shape) - alphas = extract_into_tensor(alphas, timesteps, sample.shape) pred_x_0 = (sample - sigmas * model_output) / alphas + elif prediction_type == "sample": + pred_x_0 = model_output elif prediction_type == "v_prediction": - pred_x_0 = alphas[timesteps] * sample - sigmas[timesteps] * model_output + pred_x_0 = alphas * sample - sigmas * model_output else: - raise ValueError(f"Prediction type {prediction_type} currently not supported.") + raise ValueError( + f"Prediction type {prediction_type} is not supported; currently, `epsilon`, `sample`, and `v_prediction`" + f" are supported." + ) return pred_x_0 +# Based on step 4 in DDIMScheduler.step +def get_predicted_noise(model_output, timesteps, sample, prediction_type, alphas, sigmas): + alphas = extract_into_tensor(alphas, timesteps, sample.shape) + sigmas = extract_into_tensor(sigmas, timesteps, sample.shape) + if prediction_type == "epsilon": + pred_epsilon = model_output + elif prediction_type == "sample": + pred_epsilon = (sample - alphas * model_output) / sigmas + elif prediction_type == "v_prediction": + pred_epsilon = alphas * model_output + sigmas * sample + else: + raise ValueError( + f"Prediction type {prediction_type} is not supported; currently, `epsilon`, `sample`, and `v_prediction`" + f" are supported." + ) + + return pred_epsilon + + def extract_into_tensor(a, t, x_shape): b, *_ = t.shape out = a.gather(-1, t) @@ -835,34 +859,35 @@ def main(args): args.pretrained_teacher_model, subfolder="scheduler", revision=args.teacher_revision ) - # The scheduler calculates the alpha and sigma schedule for us + # DDPMScheduler calculates the alpha and sigma noise schedules (based on the alpha bars) for us alpha_schedule = torch.sqrt(noise_scheduler.alphas_cumprod) sigma_schedule = torch.sqrt(1 - noise_scheduler.alphas_cumprod) + # Initialize the DDIM ODE solver for distillation. solver = DDIMSolver( noise_scheduler.alphas_cumprod.numpy(), timesteps=noise_scheduler.config.num_train_timesteps, ddim_timesteps=args.num_ddim_timesteps, ) - # 2. Load tokenizers from SD-XL checkpoint. + # 2. Load tokenizers from SD 1.X/2.X checkpoint. 
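
The renamed `get_predicted_original_sample` and the new `get_predicted_noise` helpers above both invert the same forward-noising relation x_t = alpha_t * x_0 + sigma_t * eps for whichever `prediction_type` the teacher was trained with. A scalar sanity check of those conversions, assuming a variance-preserving schedule with alpha_t**2 + sigma_t**2 == 1 (the scripts broadcast per-timestep tensors via `extract_into_tensor` instead of scalars):

import torch

alpha_t, sigma_t = 0.8, 0.6                  # any pair with alpha_t**2 + sigma_t**2 == 1
x0, eps = torch.randn(4), torch.randn(4)
x_t = alpha_t * x0 + sigma_t * eps           # forward diffusion at timestep t

# "epsilon" parameterization -> x_0 (mirrors get_predicted_original_sample):
x0_from_eps = (x_t - sigma_t * eps) / alpha_t

# "v_prediction" parameterization: v = alpha_t * eps - sigma_t * x_0
v = alpha_t * eps - sigma_t * x0
x0_from_v = alpha_t * x_t - sigma_t * v      # -> x_0 (get_predicted_original_sample)
eps_from_v = alpha_t * v + sigma_t * x_t     # -> eps (get_predicted_noise)

assert torch.allclose(x0_from_eps, x0, atol=1e-6)
assert torch.allclose(x0_from_v, x0, atol=1e-6)
assert torch.allclose(eps_from_v, eps, atol=1e-6)
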
tokenizer = AutoTokenizer.from_pretrained( args.pretrained_teacher_model, subfolder="tokenizer", revision=args.teacher_revision, use_fast=False ) - # 3. Load text encoders from SD-1.5 checkpoint. + # 3. Load text encoders from SD 1.X/2.X checkpoint. # import correct text encoder classes text_encoder = CLIPTextModel.from_pretrained( args.pretrained_teacher_model, subfolder="text_encoder", revision=args.teacher_revision ) - # 4. Load VAE from SD-XL checkpoint (or more stable VAE) + # 4. Load VAE from SD 1.X/2.X checkpoint vae = AutoencoderKL.from_pretrained( args.pretrained_teacher_model, subfolder="vae", revision=args.teacher_revision, ) - # 5. Load teacher U-Net from SD-XL checkpoint + # 5. Load teacher U-Net from SD 1.X/2.X checkpoint teacher_unet = UNet2DConditionModel.from_pretrained( args.pretrained_teacher_model, subfolder="unet", revision=args.teacher_revision ) @@ -872,7 +897,7 @@ def main(args): text_encoder.requires_grad_(False) teacher_unet.requires_grad_(False) - # 7. Create online (`unet`) student U-Nets. + # 7. Create online student U-Net. unet = UNet2DConditionModel.from_pretrained( args.pretrained_teacher_model, subfolder="unet", revision=args.teacher_revision ) @@ -935,6 +960,7 @@ def main(args): # Also move the alpha and sigma noise schedules to accelerator.device. alpha_schedule = alpha_schedule.to(accelerator.device) sigma_schedule = sigma_schedule.to(accelerator.device) + # Move the ODE solver to accelerator.device. solver = solver.to(accelerator.device) # 10. Handle saving and loading of checkpoints @@ -1011,13 +1037,14 @@ def load_model_hook(models, input_dir): eps=args.adam_epsilon, ) + # 13. Dataset creation and data processing # Here, we compute not just the text embeddings but also the additional embeddings # needed for the SD XL UNet to operate. def compute_embeddings(prompt_batch, proportion_empty_prompts, text_encoder, tokenizer, is_train=True): prompt_embeds = encode_prompt(prompt_batch, text_encoder, tokenizer, proportion_empty_prompts, is_train) return {"prompt_embeds": prompt_embeds} - dataset = Text2ImageDataset( + dataset = SDText2ImageDataset( train_shards_path_or_url=args.train_shards_path_or_url, num_train_examples=args.max_train_samples, per_gpu_batch_size=args.train_batch_size, @@ -1037,6 +1064,7 @@ def compute_embeddings(prompt_batch, proportion_empty_prompts, text_encoder, tok tokenizer=tokenizer, ) + # 14. LR Scheduler creation # Scheduler and math around the number of training steps. overrode_max_train_steps = False num_update_steps_per_epoch = math.ceil(train_dataloader.num_batches / args.gradient_accumulation_steps) @@ -1051,6 +1079,7 @@ def compute_embeddings(prompt_batch, proportion_empty_prompts, text_encoder, tok num_training_steps=args.max_train_steps, ) + # 15. Prepare for training # Prepare everything with our `accelerator`. unet, optimizer, lr_scheduler = accelerator.prepare(unet, optimizer, lr_scheduler) @@ -1072,7 +1101,7 @@ def compute_embeddings(prompt_batch, proportion_empty_prompts, text_encoder, tok ).input_ids.to(accelerator.device) uncond_prompt_embeds = text_encoder(uncond_input_ids)[0] - # Train! + # 16. Train! total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps logger.info("***** Running training *****") @@ -1123,6 +1152,7 @@ def compute_embeddings(prompt_batch, proportion_empty_prompts, text_encoder, tok for epoch in range(first_epoch, args.num_train_epochs): for step, batch in enumerate(train_dataloader): with accelerator.accumulate(unet): + # 1. 
Load and process the image and text conditioning image, text = batch image = image.to(accelerator.device, non_blocking=True) @@ -1140,37 +1170,37 @@ def compute_embeddings(prompt_batch, proportion_empty_prompts, text_encoder, tok latents = latents * vae.config.scaling_factor latents = latents.to(weight_dtype) - - # Sample noise that we'll add to the latents - noise = torch.randn_like(latents) bsz = latents.shape[0] - # Sample a random timestep for each image t_n ~ U[0, N - k - 1] without bias. + # 2. Sample a random timestep for each image t_n from the ODE solver timesteps without bias. + # For the DDIM solver, the timestep schedule is [T - 1, T - k - 1, T - 2 * k - 1, ...] topk = noise_scheduler.config.num_train_timesteps // args.num_ddim_timesteps index = torch.randint(0, args.num_ddim_timesteps, (bsz,), device=latents.device).long() start_timesteps = solver.ddim_timesteps[index] timesteps = start_timesteps - topk timesteps = torch.where(timesteps < 0, torch.zeros_like(timesteps), timesteps) - # 20.4.4. Get boundary scalings for start_timesteps and (end) timesteps. + # 3. Get boundary scalings for start_timesteps and (end) timesteps. c_skip_start, c_out_start = scalings_for_boundary_conditions(start_timesteps) c_skip_start, c_out_start = [append_dims(x, latents.ndim) for x in [c_skip_start, c_out_start]] c_skip, c_out = scalings_for_boundary_conditions(timesteps) c_skip, c_out = [append_dims(x, latents.ndim) for x in [c_skip, c_out]] - # 20.4.5. Add noise to the latents according to the noise magnitude at each timestep - # (this is the forward diffusion process) [z_{t_{n + k}} in Algorithm 1] + # 4. Sample noise from the prior and add it to the latents according to the noise magnitude at each + # timestep (this is the forward diffusion process) [z_{t_{n + k}} in Algorithm 1] + noise = torch.randn_like(latents) noisy_model_input = noise_scheduler.add_noise(latents, noise, start_timesteps) - # 20.4.6. Sample a random guidance scale w from U[w_min, w_max] and embed it + # 5. Sample a random guidance scale w from U[w_min, w_max] + # Note that for LCM-LoRA distillation it is not necessary to use a guidance scale embedding w = (args.w_max - args.w_min) * torch.rand((bsz,)) + args.w_min w = w.reshape(bsz, 1, 1, 1) w = w.to(device=latents.device, dtype=latents.dtype) - # 20.4.8. Prepare prompt embeds and unet_added_conditions + # 6. Prepare prompt embeds and unet_added_conditions prompt_embeds = encoded_text.pop("prompt_embeds") - # 20.4.9. Get online LCM prediction on z_{t_{n + k}}, w, c, t_{n + k} + # 7. Get online LCM prediction on z_{t_{n + k}} (noisy_model_input), w, c, t_{n + k} (start_timesteps) noise_pred = unet( noisy_model_input, start_timesteps, @@ -1179,7 +1209,7 @@ def compute_embeddings(prompt_batch, proportion_empty_prompts, text_encoder, tok added_cond_kwargs=encoded_text, ).sample - pred_x_0 = predicted_origin( + pred_x_0 = get_predicted_original_sample( noise_pred, start_timesteps, noisy_model_input, @@ -1190,17 +1220,27 @@ def compute_embeddings(prompt_batch, proportion_empty_prompts, text_encoder, tok model_pred = c_skip_start * noisy_model_input + c_out_start * pred_x_0 - # 20.4.10. Use the ODE solver to predict the kth step in the augmented PF-ODE trajectory after - # noisy_latents with both the conditioning embedding c and unconditional embedding 0 - # Get teacher model prediction on noisy_latents and conditional embedding + # 8. 
Compute the conditional and unconditional teacher model predictions to get CFG estimates of the + # predicted noise eps_0 and predicted original sample x_0, then run the ODE solver using these + # estimates to predict the data point in the augmented PF-ODE trajectory corresponding to the next ODE + # solver timestep. with torch.no_grad(): with torch.autocast("cuda"): + # 1. Get teacher model prediction on noisy_model_input z_{t_{n + k}} and conditional embedding c cond_teacher_output = teacher_unet( noisy_model_input.to(weight_dtype), start_timesteps, encoder_hidden_states=prompt_embeds.to(weight_dtype), ).sample - cond_pred_x0 = predicted_origin( + cond_pred_x0 = get_predicted_original_sample( + cond_teacher_output, + start_timesteps, + noisy_model_input, + noise_scheduler.config.prediction_type, + alpha_schedule, + sigma_schedule, + ) + cond_pred_noise = get_predicted_noise( cond_teacher_output, start_timesteps, noisy_model_input, @@ -1209,13 +1249,21 @@ def compute_embeddings(prompt_batch, proportion_empty_prompts, text_encoder, tok sigma_schedule, ) - # Get teacher model prediction on noisy_latents and unconditional embedding + # 2. Get teacher model prediction on noisy_model_input z_{t_{n + k}} and unconditional embedding 0 uncond_teacher_output = teacher_unet( noisy_model_input.to(weight_dtype), start_timesteps, encoder_hidden_states=uncond_prompt_embeds.to(weight_dtype), ).sample - uncond_pred_x0 = predicted_origin( + uncond_pred_x0 = get_predicted_original_sample( + uncond_teacher_output, + start_timesteps, + noisy_model_input, + noise_scheduler.config.prediction_type, + alpha_schedule, + sigma_schedule, + ) + uncond_pred_noise = get_predicted_noise( uncond_teacher_output, start_timesteps, noisy_model_input, @@ -1224,12 +1272,17 @@ def compute_embeddings(prompt_batch, proportion_empty_prompts, text_encoder, tok sigma_schedule, ) - # 20.4.11. Perform "CFG" to get x_prev estimate (using the LCM paper's CFG formulation) + # 3. Calculate the CFG estimate of x_0 (pred_x0) and eps_0 (pred_noise) + # Note that this uses the LCM paper's CFG formulation rather than the Imagen CFG formulation pred_x0 = cond_pred_x0 + w * (cond_pred_x0 - uncond_pred_x0) - pred_noise = cond_teacher_output + w * (cond_teacher_output - uncond_teacher_output) + pred_noise = cond_pred_noise + w * (cond_pred_noise - uncond_pred_noise) + # 4. Run one step of the ODE solver to estimate the next point x_prev on the + # augmented PF-ODE trajectory (solving backward in time) + # Note that the DDIM step depends on both the predicted x_0 and source noise eps_0. x_prev = solver.ddim_step(pred_x0, pred_noise, index) - # 20.4.12. Get target LCM prediction on x_prev, w, c, t_n + # 9. Get target LCM prediction on x_prev, w, c, t_n (timesteps) + # Note that we do not use a separate target network for LCM-LoRA distillation. with torch.no_grad(): with torch.autocast("cuda", dtype=weight_dtype): target_noise_pred = unet( @@ -1238,7 +1291,7 @@ def compute_embeddings(prompt_batch, proportion_empty_prompts, text_encoder, tok timestep_cond=None, encoder_hidden_states=prompt_embeds.float(), ).sample - pred_x_0 = predicted_origin( + pred_x_0 = get_predicted_original_sample( target_noise_pred, timesteps, x_prev, @@ -1248,7 +1301,7 @@ def compute_embeddings(prompt_batch, proportion_empty_prompts, text_encoder, tok ) target = c_skip * x_prev + c_out * pred_x_0 - # 20.4.13. Calculate loss + # 10. 
Calculate loss if args.loss_type == "l2": loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") elif args.loss_type == "huber": @@ -1256,7 +1309,7 @@ def compute_embeddings(prompt_batch, proportion_empty_prompts, text_encoder, tok torch.sqrt((model_pred.float() - target.float()) ** 2 + args.huber_c**2) - args.huber_c ) - # 20.4.14. Backpropagate on the online student model (`unet`) + # 11. Backpropagate on the online student model (`unet`) accelerator.backward(loss) if accelerator.sync_gradients: accelerator.clip_grad_norm_(unet.parameters(), args.max_grad_norm) diff --git a/examples/consistency_distillation/train_lcm_distill_lora_sdxl_wds.py b/examples/consistency_distillation/train_lcm_distill_lora_sdxl_wds.py index 2ecd6f43dcde..014a770fa0ba 100644 --- a/examples/consistency_distillation/train_lcm_distill_lora_sdxl_wds.py +++ b/examples/consistency_distillation/train_lcm_distill_lora_sdxl_wds.py @@ -162,7 +162,7 @@ def __call__(self, x): return False -class Text2ImageDataset: +class SDXLText2ImageDataset: def __init__( self, train_shards_path_or_url: Union[str, List[str]], @@ -346,19 +346,43 @@ def scalings_for_boundary_conditions(timestep, sigma_data=0.5, timestep_scaling= # Compare LCMScheduler.step, Step 4 -def predicted_origin(model_output, timesteps, sample, prediction_type, alphas, sigmas): +def get_predicted_original_sample(model_output, timesteps, sample, prediction_type, alphas, sigmas): + alphas = extract_into_tensor(alphas, timesteps, sample.shape) + sigmas = extract_into_tensor(sigmas, timesteps, sample.shape) if prediction_type == "epsilon": - sigmas = extract_into_tensor(sigmas, timesteps, sample.shape) - alphas = extract_into_tensor(alphas, timesteps, sample.shape) pred_x_0 = (sample - sigmas * model_output) / alphas + elif prediction_type == "sample": + pred_x_0 = model_output elif prediction_type == "v_prediction": - pred_x_0 = alphas[timesteps] * sample - sigmas[timesteps] * model_output + pred_x_0 = alphas * sample - sigmas * model_output else: - raise ValueError(f"Prediction type {prediction_type} currently not supported.") + raise ValueError( + f"Prediction type {prediction_type} is not supported; currently, `epsilon`, `sample`, and `v_prediction`" + f" are supported." + ) return pred_x_0 +# Based on step 4 in DDIMScheduler.step +def get_predicted_noise(model_output, timesteps, sample, prediction_type, alphas, sigmas): + alphas = extract_into_tensor(alphas, timesteps, sample.shape) + sigmas = extract_into_tensor(sigmas, timesteps, sample.shape) + if prediction_type == "epsilon": + pred_epsilon = model_output + elif prediction_type == "sample": + pred_epsilon = (sample - alphas * model_output) / sigmas + elif prediction_type == "v_prediction": + pred_epsilon = alphas * model_output + sigmas * sample + else: + raise ValueError( + f"Prediction type {prediction_type} is not supported; currently, `epsilon`, `sample`, and `v_prediction`" + f" are supported." + ) + + return pred_epsilon + + def extract_into_tensor(a, t, x_shape): b, *_ = t.shape out = a.gather(-1, t) @@ -830,9 +854,10 @@ def main(args): args.pretrained_teacher_model, subfolder="scheduler", revision=args.teacher_revision ) - # The scheduler calculates the alpha and sigma schedule for us + # DDPMScheduler calculates the alpha and sigma noise schedules (based on the alpha bars) for us alpha_schedule = torch.sqrt(noise_scheduler.alphas_cumprod) sigma_schedule = torch.sqrt(1 - noise_scheduler.alphas_cumprod) + # Initialize the DDIM ODE solver for distillation. 
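
With CFG estimates of both x_0 and eps in hand, the `solver.ddim_step(pred_x0, pred_noise, index)` call used throughout these scripts only has to perform the deterministic (eta = 0) DDIM update to the previous solver timestep. A minimal sketch of that step, assuming `alpha_bar_prev` is the cumulative alpha product at the target timestep (a per-sample tensor in the real solver, a plain scalar here):

import torch

def ddim_step_sketch(pred_x0: torch.Tensor, pred_noise: torch.Tensor, alpha_bar_prev: float) -> torch.Tensor:
    # x_prev = sqrt(alpha_bar_prev) * x_0_hat + sqrt(1 - alpha_bar_prev) * eps_hat,
    # i.e. re-noise the predicted clean sample to the noise level of the previous
    # (smaller) DDIM timestep along the probability-flow trajectory.
    a_prev = torch.as_tensor(alpha_bar_prev, dtype=pred_x0.dtype, device=pred_x0.device)
    return a_prev.sqrt() * pred_x0 + (1.0 - a_prev).sqrt() * pred_noise

This single jump is what lets the student be regressed onto a target prediction at t_n without running a full sampling loop.
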
solver = DDIMSolver( noise_scheduler.alphas_cumprod.numpy(), timesteps=noise_scheduler.config.num_train_timesteps, @@ -886,7 +911,7 @@ def main(args): text_encoder_two.requires_grad_(False) teacher_unet.requires_grad_(False) - # 7. Create online (`unet`) student U-Nets. + # 7. Create online student U-Net. unet = UNet2DConditionModel.from_pretrained( args.pretrained_teacher_model, subfolder="unet", revision=args.teacher_revision ) @@ -950,6 +975,7 @@ def main(args): # Also move the alpha and sigma noise schedules to accelerator.device. alpha_schedule = alpha_schedule.to(accelerator.device) sigma_schedule = sigma_schedule.to(accelerator.device) + # Move the ODE solver to accelerator.device. solver = solver.to(accelerator.device) # 10. Handle saving and loading of checkpoints @@ -1057,7 +1083,7 @@ def compute_embeddings( return {"prompt_embeds": prompt_embeds, **unet_added_cond_kwargs} - dataset = Text2ImageDataset( + dataset = SDXLText2ImageDataset( train_shards_path_or_url=args.train_shards_path_or_url, num_train_examples=args.max_train_samples, per_gpu_batch_size=args.train_batch_size, @@ -1175,6 +1201,7 @@ def compute_embeddings( for epoch in range(first_epoch, args.num_train_epochs): for step, batch in enumerate(train_dataloader): with accelerator.accumulate(unet): + # 1. Load and process the image, text, and micro-conditioning (original image size, crop coordinates) image, text, orig_size, crop_coords = batch image = image.to(accelerator.device, non_blocking=True) @@ -1196,37 +1223,37 @@ def compute_embeddings( latents = latents * vae.config.scaling_factor if args.pretrained_vae_model_name_or_path is None: latents = latents.to(weight_dtype) - - # Sample noise that we'll add to the latents - noise = torch.randn_like(latents) bsz = latents.shape[0] - # Sample a random timestep for each image t_n ~ U[0, N - k - 1] without bias. + # 2. Sample a random timestep for each image t_n from the ODE solver timesteps without bias. + # For the DDIM solver, the timestep schedule is [T - 1, T - k - 1, T - 2 * k - 1, ...] topk = noise_scheduler.config.num_train_timesteps // args.num_ddim_timesteps index = torch.randint(0, args.num_ddim_timesteps, (bsz,), device=latents.device).long() start_timesteps = solver.ddim_timesteps[index] timesteps = start_timesteps - topk timesteps = torch.where(timesteps < 0, torch.zeros_like(timesteps), timesteps) - # 20.4.4. Get boundary scalings for start_timesteps and (end) timesteps. + # 3. Get boundary scalings for start_timesteps and (end) timesteps. c_skip_start, c_out_start = scalings_for_boundary_conditions(start_timesteps) c_skip_start, c_out_start = [append_dims(x, latents.ndim) for x in [c_skip_start, c_out_start]] c_skip, c_out = scalings_for_boundary_conditions(timesteps) c_skip, c_out = [append_dims(x, latents.ndim) for x in [c_skip, c_out]] - # 20.4.5. Add noise to the latents according to the noise magnitude at each timestep - # (this is the forward diffusion process) [z_{t_{n + k}} in Algorithm 1] + # 4. Sample noise from the prior and add it to the latents according to the noise magnitude at each + # timestep (this is the forward diffusion process) [z_{t_{n + k}} in Algorithm 1] + noise = torch.randn_like(latents) noisy_model_input = noise_scheduler.add_noise(latents, noise, start_timesteps) - # 20.4.6. Sample a random guidance scale w from U[w_min, w_max] and embed it + # 5. 
Sample a random guidance scale w from U[w_min, w_max] + # Note that for LCM-LoRA distillation it is not necessary to use a guidance scale embedding w = (args.w_max - args.w_min) * torch.rand((bsz,)) + args.w_min w = w.reshape(bsz, 1, 1, 1) w = w.to(device=latents.device, dtype=latents.dtype) - # 20.4.8. Prepare prompt embeds and unet_added_conditions + # 6. Prepare prompt embeds and unet_added_conditions prompt_embeds = encoded_text.pop("prompt_embeds") - # 20.4.9. Get online LCM prediction on z_{t_{n + k}}, w, c, t_{n + k} + # 7. Get online LCM prediction on z_{t_{n + k}} (noisy_model_input), w, c, t_{n + k} (start_timesteps) noise_pred = unet( noisy_model_input, start_timesteps, @@ -1235,7 +1262,7 @@ def compute_embeddings( added_cond_kwargs=encoded_text, ).sample - pred_x_0 = predicted_origin( + pred_x_0 = get_predicted_original_sample( noise_pred, start_timesteps, noisy_model_input, @@ -1246,18 +1273,28 @@ def compute_embeddings( model_pred = c_skip_start * noisy_model_input + c_out_start * pred_x_0 - # 20.4.10. Use the ODE solver to predict the kth step in the augmented PF-ODE trajectory after - # noisy_latents with both the conditioning embedding c and unconditional embedding 0 - # Get teacher model prediction on noisy_latents and conditional embedding + # 8. Compute the conditional and unconditional teacher model predictions to get CFG estimates of the + # predicted noise eps_0 and predicted original sample x_0, then run the ODE solver using these + # estimates to predict the data point in the augmented PF-ODE trajectory corresponding to the next ODE + # solver timestep. with torch.no_grad(): with torch.autocast("cuda"): + # 1. Get teacher model prediction on noisy_model_input z_{t_{n + k}} and conditional embedding c cond_teacher_output = teacher_unet( noisy_model_input.to(weight_dtype), start_timesteps, encoder_hidden_states=prompt_embeds.to(weight_dtype), added_cond_kwargs={k: v.to(weight_dtype) for k, v in encoded_text.items()}, ).sample - cond_pred_x0 = predicted_origin( + cond_pred_x0 = get_predicted_original_sample( + cond_teacher_output, + start_timesteps, + noisy_model_input, + noise_scheduler.config.prediction_type, + alpha_schedule, + sigma_schedule, + ) + cond_pred_noise = get_predicted_noise( cond_teacher_output, start_timesteps, noisy_model_input, @@ -1266,7 +1303,7 @@ def compute_embeddings( sigma_schedule, ) - # Get teacher model prediction on noisy_latents and unconditional embedding + # 2. Get teacher model prediction on noisy_model_input z_{t_{n + k}} and unconditional embedding 0 uncond_added_conditions = copy.deepcopy(encoded_text) uncond_added_conditions["text_embeds"] = uncond_pooled_prompt_embeds uncond_teacher_output = teacher_unet( @@ -1275,7 +1312,15 @@ def compute_embeddings( encoder_hidden_states=uncond_prompt_embeds.to(weight_dtype), added_cond_kwargs={k: v.to(weight_dtype) for k, v in uncond_added_conditions.items()}, ).sample - uncond_pred_x0 = predicted_origin( + uncond_pred_x0 = get_predicted_original_sample( + uncond_teacher_output, + start_timesteps, + noisy_model_input, + noise_scheduler.config.prediction_type, + alpha_schedule, + sigma_schedule, + ) + uncond_pred_noise = get_predicted_noise( uncond_teacher_output, start_timesteps, noisy_model_input, @@ -1284,12 +1329,17 @@ def compute_embeddings( sigma_schedule, ) - # 20.4.11. Perform "CFG" to get x_prev estimate (using the LCM paper's CFG formulation) + # 3. 
Calculate the CFG estimate of x_0 (pred_x0) and eps_0 (pred_noise) + # Note that this uses the LCM paper's CFG formulation rather than the Imagen CFG formulation pred_x0 = cond_pred_x0 + w * (cond_pred_x0 - uncond_pred_x0) - pred_noise = cond_teacher_output + w * (cond_teacher_output - uncond_teacher_output) + pred_noise = cond_pred_noise + w * (cond_pred_noise - uncond_pred_noise) + # 4. Run one step of the ODE solver to estimate the next point x_prev on the + # augmented PF-ODE trajectory (solving backward in time) + # Note that the DDIM step depends on both the predicted x_0 and source noise eps_0. x_prev = solver.ddim_step(pred_x0, pred_noise, index) - # 20.4.12. Get target LCM prediction on x_prev, w, c, t_n + # 9. Get target LCM prediction on x_prev, w, c, t_n (timesteps) + # Note that we do not use a separate target network for LCM-LoRA distillation. with torch.no_grad(): with torch.autocast("cuda", enabled=True, dtype=weight_dtype): target_noise_pred = unet( @@ -1299,7 +1349,7 @@ def compute_embeddings( encoder_hidden_states=prompt_embeds.float(), added_cond_kwargs=encoded_text, ).sample - pred_x_0 = predicted_origin( + pred_x_0 = get_predicted_original_sample( target_noise_pred, timesteps, x_prev, @@ -1309,7 +1359,7 @@ def compute_embeddings( ) target = c_skip * x_prev + c_out * pred_x_0 - # 20.4.13. Calculate loss + # 10. Calculate loss if args.loss_type == "l2": loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") elif args.loss_type == "huber": @@ -1317,7 +1367,7 @@ def compute_embeddings( torch.sqrt((model_pred.float() - target.float()) ** 2 + args.huber_c**2) - args.huber_c ) - # 20.4.14. Backpropagate on the online student model (`unet`) + # 11. Backpropagate on the online student model (`unet`) accelerator.backward(loss) if accelerator.sync_gradients: accelerator.clip_grad_norm_(unet.parameters(), args.max_grad_norm) diff --git a/examples/consistency_distillation/train_lcm_distill_sd_wds.py b/examples/consistency_distillation/train_lcm_distill_sd_wds.py index 1dfac0464271..54d05bb5ea26 100644 --- a/examples/consistency_distillation/train_lcm_distill_sd_wds.py +++ b/examples/consistency_distillation/train_lcm_distill_sd_wds.py @@ -138,7 +138,7 @@ def __call__(self, x): return False -class Text2ImageDataset: +class SDText2ImageDataset: def __init__( self, train_shards_path_or_url: Union[str, List[str]], @@ -336,19 +336,43 @@ def scalings_for_boundary_conditions(timestep, sigma_data=0.5, timestep_scaling= # Compare LCMScheduler.step, Step 4 -def predicted_origin(model_output, timesteps, sample, prediction_type, alphas, sigmas): +def get_predicted_original_sample(model_output, timesteps, sample, prediction_type, alphas, sigmas): + alphas = extract_into_tensor(alphas, timesteps, sample.shape) + sigmas = extract_into_tensor(sigmas, timesteps, sample.shape) if prediction_type == "epsilon": - sigmas = extract_into_tensor(sigmas, timesteps, sample.shape) - alphas = extract_into_tensor(alphas, timesteps, sample.shape) pred_x_0 = (sample - sigmas * model_output) / alphas + elif prediction_type == "sample": + pred_x_0 = model_output elif prediction_type == "v_prediction": - pred_x_0 = alphas[timesteps] * sample - sigmas[timesteps] * model_output + pred_x_0 = alphas * sample - sigmas * model_output else: - raise ValueError(f"Prediction type {prediction_type} currently not supported.") + raise ValueError( + f"Prediction type {prediction_type} is not supported; currently, `epsilon`, `sample`, and `v_prediction`" + f" are supported." 
+ ) return pred_x_0 +# Based on step 4 in DDIMScheduler.step +def get_predicted_noise(model_output, timesteps, sample, prediction_type, alphas, sigmas): + alphas = extract_into_tensor(alphas, timesteps, sample.shape) + sigmas = extract_into_tensor(sigmas, timesteps, sample.shape) + if prediction_type == "epsilon": + pred_epsilon = model_output + elif prediction_type == "sample": + pred_epsilon = (sample - alphas * model_output) / sigmas + elif prediction_type == "v_prediction": + pred_epsilon = alphas * model_output + sigmas * sample + else: + raise ValueError( + f"Prediction type {prediction_type} is not supported; currently, `epsilon`, `sample`, and `v_prediction`" + f" are supported." + ) + + return pred_epsilon + + def extract_into_tensor(a, t, x_shape): b, *_ = t.shape out = a.gather(-1, t) @@ -823,34 +847,35 @@ def main(args): args.pretrained_teacher_model, subfolder="scheduler", revision=args.teacher_revision ) - # The scheduler calculates the alpha and sigma schedule for us + # DDPMScheduler calculates the alpha and sigma noise schedules (based on the alpha bars) for us alpha_schedule = torch.sqrt(noise_scheduler.alphas_cumprod) sigma_schedule = torch.sqrt(1 - noise_scheduler.alphas_cumprod) + # Initialize the DDIM ODE solver for distillation. solver = DDIMSolver( noise_scheduler.alphas_cumprod.numpy(), timesteps=noise_scheduler.config.num_train_timesteps, ddim_timesteps=args.num_ddim_timesteps, ) - # 2. Load tokenizers from SD-XL checkpoint. + # 2. Load tokenizers from SD 1.X/2.X checkpoint. tokenizer = AutoTokenizer.from_pretrained( args.pretrained_teacher_model, subfolder="tokenizer", revision=args.teacher_revision, use_fast=False ) - # 3. Load text encoders from SD-1.5 checkpoint. + # 3. Load text encoders from SD 1.X/2.X checkpoint. # import correct text encoder classes text_encoder = CLIPTextModel.from_pretrained( args.pretrained_teacher_model, subfolder="text_encoder", revision=args.teacher_revision ) - # 4. Load VAE from SD-XL checkpoint (or more stable VAE) + # 4. Load VAE from SD 1.X/2.X checkpoint vae = AutoencoderKL.from_pretrained( args.pretrained_teacher_model, subfolder="vae", revision=args.teacher_revision, ) - # 5. Load teacher U-Net from SD-XL checkpoint + # 5. Load teacher U-Net from SD 1.X/2.X checkpoint teacher_unet = UNet2DConditionModel.from_pretrained( args.pretrained_teacher_model, subfolder="unet", revision=args.teacher_revision ) @@ -860,7 +885,7 @@ def main(args): text_encoder.requires_grad_(False) teacher_unet.requires_grad_(False) - # 8. Create online (`unet`) student U-Nets. This will be updated by the optimizer (e.g. via backpropagation.) + # 7. Create online student U-Net. This will be updated by the optimizer (e.g. via backpropagation.) # Add `time_cond_proj_dim` to the student U-Net if `teacher_unet.config.time_cond_proj_dim` is None if teacher_unet.config.time_cond_proj_dim is None: teacher_unet.config["time_cond_proj_dim"] = args.unet_time_cond_proj_dim @@ -869,8 +894,8 @@ def main(args): unet.load_state_dict(teacher_unet.state_dict(), strict=False) unet.train() - # 9. Create target (`ema_unet`) student U-Net parameters. This will be updated via EMA updates (polyak averaging). - # Initialize from unet + # 8. Create target student U-Net. This will be updated via EMA updates (polyak averaging). 
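
The target student referenced in the comment above is never optimized directly; after each optimizer step the scripts call `update_ema(target_unet.parameters(), unet.parameters(), args.ema_decay)` to pull it toward the online student. A minimal sketch of that polyak update (the default decay value here is an assumption; the scripts read it from `args.ema_decay`):

import torch

@torch.no_grad()
def update_ema_sketch(target_params, online_params, decay: float = 0.95) -> None:
    # target <- decay * target + (1 - decay) * online. With decay close to 1 the
    # target drifts slowly, which keeps the distillation target stable while the
    # online student is updated by backpropagation every step.
    for targ, src in zip(target_params, online_params):
        targ.mul_(decay).add_(src, alpha=1.0 - decay)
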
+ # Initialize from (online) unet target_unet = UNet2DConditionModel(**teacher_unet.config) target_unet.load_state_dict(unet.state_dict()) target_unet.train() @@ -887,7 +912,7 @@ def main(args): f"Controlnet loaded as datatype {accelerator.unwrap_model(unet).dtype}. {low_precision_error_string}" ) - # 10. Handle mixed precision and device placement + # 9. Handle mixed precision and device placement # For mixed precision training we cast all non-trainable weigths to half-precision # as these weights are only used for inference, keeping weights in full precision is not required. weight_dtype = torch.float32 @@ -914,7 +939,7 @@ def main(args): sigma_schedule = sigma_schedule.to(accelerator.device) solver = solver.to(accelerator.device) - # 11. Handle saving and loading of checkpoints + # 10. Handle saving and loading of checkpoints # `accelerate` 0.16.0 will have better support for customized saving if version.parse(accelerate.__version__) >= version.parse("0.16.0"): # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format @@ -948,7 +973,7 @@ def load_model_hook(models, input_dir): accelerator.register_save_state_pre_hook(save_model_hook) accelerator.register_load_state_pre_hook(load_model_hook) - # 12. Enable optimizations + # 11. Enable optimizations if args.enable_xformers_memory_efficient_attention: if is_xformers_available(): import xformers @@ -994,13 +1019,14 @@ def load_model_hook(models, input_dir): eps=args.adam_epsilon, ) + # 13. Dataset creation and data processing # Here, we compute not just the text embeddings but also the additional embeddings # needed for the SD XL UNet to operate. def compute_embeddings(prompt_batch, proportion_empty_prompts, text_encoder, tokenizer, is_train=True): prompt_embeds = encode_prompt(prompt_batch, text_encoder, tokenizer, proportion_empty_prompts, is_train) return {"prompt_embeds": prompt_embeds} - dataset = Text2ImageDataset( + dataset = SDText2ImageDataset( train_shards_path_or_url=args.train_shards_path_or_url, num_train_examples=args.max_train_samples, per_gpu_batch_size=args.train_batch_size, @@ -1020,6 +1046,7 @@ def compute_embeddings(prompt_batch, proportion_empty_prompts, text_encoder, tok tokenizer=tokenizer, ) + # 14. LR Scheduler creation # Scheduler and math around the number of training steps. overrode_max_train_steps = False num_update_steps_per_epoch = math.ceil(train_dataloader.num_batches / args.gradient_accumulation_steps) @@ -1034,6 +1061,7 @@ def compute_embeddings(prompt_batch, proportion_empty_prompts, text_encoder, tok num_training_steps=args.max_train_steps, ) + # 15. Prepare for training # Prepare everything with our `accelerator`. unet, optimizer, lr_scheduler = accelerator.prepare(unet, optimizer, lr_scheduler) @@ -1055,7 +1083,7 @@ def compute_embeddings(prompt_batch, proportion_empty_prompts, text_encoder, tok ).input_ids.to(accelerator.device) uncond_prompt_embeds = text_encoder(uncond_input_ids)[0] - # Train! + # 16. Train! total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps logger.info("***** Running training *****") @@ -1106,6 +1134,7 @@ def compute_embeddings(prompt_batch, proportion_empty_prompts, text_encoder, tok for epoch in range(first_epoch, args.num_train_epochs): for step, batch in enumerate(train_dataloader): with accelerator.accumulate(unet): + # 1. 
Load and process the image and text conditioning image, text = batch image = image.to(accelerator.device, non_blocking=True) @@ -1123,29 +1152,28 @@ def compute_embeddings(prompt_batch, proportion_empty_prompts, text_encoder, tok latents = latents * vae.config.scaling_factor latents = latents.to(weight_dtype) - - # Sample noise that we'll add to the latents - noise = torch.randn_like(latents) bsz = latents.shape[0] - # Sample a random timestep for each image t_n ~ U[0, N - k - 1] without bias. + # 2. Sample a random timestep for each image t_n from the ODE solver timesteps without bias. + # For the DDIM solver, the timestep schedule is [T - 1, T - k - 1, T - 2 * k - 1, ...] topk = noise_scheduler.config.num_train_timesteps // args.num_ddim_timesteps index = torch.randint(0, args.num_ddim_timesteps, (bsz,), device=latents.device).long() start_timesteps = solver.ddim_timesteps[index] timesteps = start_timesteps - topk timesteps = torch.where(timesteps < 0, torch.zeros_like(timesteps), timesteps) - # 20.4.4. Get boundary scalings for start_timesteps and (end) timesteps. + # 3. Get boundary scalings for start_timesteps and (end) timesteps. c_skip_start, c_out_start = scalings_for_boundary_conditions(start_timesteps) c_skip_start, c_out_start = [append_dims(x, latents.ndim) for x in [c_skip_start, c_out_start]] c_skip, c_out = scalings_for_boundary_conditions(timesteps) c_skip, c_out = [append_dims(x, latents.ndim) for x in [c_skip, c_out]] - # 20.4.5. Add noise to the latents according to the noise magnitude at each timestep - # (this is the forward diffusion process) [z_{t_{n + k}} in Algorithm 1] + # 4. Sample noise from the prior and add it to the latents according to the noise magnitude at each + # timestep (this is the forward diffusion process) [z_{t_{n + k}} in Algorithm 1] + noise = torch.randn_like(latents) noisy_model_input = noise_scheduler.add_noise(latents, noise, start_timesteps) - # 20.4.6. Sample a random guidance scale w from U[w_min, w_max] and embed it + # 5. Sample a random guidance scale w from U[w_min, w_max] and embed it w = (args.w_max - args.w_min) * torch.rand((bsz,)) + args.w_min w_embedding = guidance_scale_embedding(w, embedding_dim=unet.config.time_cond_proj_dim) w = w.reshape(bsz, 1, 1, 1) @@ -1153,10 +1181,10 @@ def compute_embeddings(prompt_batch, proportion_empty_prompts, text_encoder, tok w = w.to(device=latents.device, dtype=latents.dtype) w_embedding = w_embedding.to(device=latents.device, dtype=latents.dtype) - # 20.4.8. Prepare prompt embeds and unet_added_conditions + # 6. Prepare prompt embeds and unet_added_conditions prompt_embeds = encoded_text.pop("prompt_embeds") - # 20.4.9. Get online LCM prediction on z_{t_{n + k}}, w, c, t_{n + k} + # 7. Get online LCM prediction on z_{t_{n + k}} (noisy_model_input), w, c, t_{n + k} (start_timesteps) noise_pred = unet( noisy_model_input, start_timesteps, @@ -1165,7 +1193,7 @@ def compute_embeddings(prompt_batch, proportion_empty_prompts, text_encoder, tok added_cond_kwargs=encoded_text, ).sample - pred_x_0 = predicted_origin( + pred_x_0 = get_predicted_original_sample( noise_pred, start_timesteps, noisy_model_input, @@ -1176,17 +1204,27 @@ def compute_embeddings(prompt_batch, proportion_empty_prompts, text_encoder, tok model_pred = c_skip_start * noisy_model_input + c_out_start * pred_x_0 - # 20.4.10. 
Use the ODE solver to predict the kth step in the augmented PF-ODE trajectory after - # noisy_latents with both the conditioning embedding c and unconditional embedding 0 - # Get teacher model prediction on noisy_latents and conditional embedding + # 8. Compute the conditional and unconditional teacher model predictions to get CFG estimates of the + # predicted noise eps_0 and predicted original sample x_0, then run the ODE solver using these + # estimates to predict the data point in the augmented PF-ODE trajectory corresponding to the next ODE + # solver timestep. with torch.no_grad(): with torch.autocast("cuda"): + # 1. Get teacher model prediction on noisy_model_input z_{t_{n + k}} and conditional embedding c cond_teacher_output = teacher_unet( noisy_model_input.to(weight_dtype), start_timesteps, encoder_hidden_states=prompt_embeds.to(weight_dtype), ).sample - cond_pred_x0 = predicted_origin( + cond_pred_x0 = get_predicted_original_sample( + cond_teacher_output, + start_timesteps, + noisy_model_input, + noise_scheduler.config.prediction_type, + alpha_schedule, + sigma_schedule, + ) + cond_pred_noise = get_predicted_noise( cond_teacher_output, start_timesteps, noisy_model_input, @@ -1195,13 +1233,21 @@ def compute_embeddings(prompt_batch, proportion_empty_prompts, text_encoder, tok sigma_schedule, ) - # Get teacher model prediction on noisy_latents and unconditional embedding + # 2. Get teacher model prediction on noisy_model_input z_{t_{n + k}} and unconditional embedding 0 uncond_teacher_output = teacher_unet( noisy_model_input.to(weight_dtype), start_timesteps, encoder_hidden_states=uncond_prompt_embeds.to(weight_dtype), ).sample - uncond_pred_x0 = predicted_origin( + uncond_pred_x0 = get_predicted_original_sample( + uncond_teacher_output, + start_timesteps, + noisy_model_input, + noise_scheduler.config.prediction_type, + alpha_schedule, + sigma_schedule, + ) + uncond_pred_noise = get_predicted_noise( uncond_teacher_output, start_timesteps, noisy_model_input, @@ -1210,12 +1256,16 @@ def compute_embeddings(prompt_batch, proportion_empty_prompts, text_encoder, tok sigma_schedule, ) - # 20.4.11. Perform "CFG" to get x_prev estimate (using the LCM paper's CFG formulation) + # 3. Calculate the CFG estimate of x_0 (pred_x0) and eps_0 (pred_noise) + # Note that this uses the LCM paper's CFG formulation rather than the Imagen CFG formulation pred_x0 = cond_pred_x0 + w * (cond_pred_x0 - uncond_pred_x0) - pred_noise = cond_teacher_output + w * (cond_teacher_output - uncond_teacher_output) + pred_noise = cond_pred_noise + w * (cond_pred_noise - uncond_pred_noise) + # 4. Run one step of the ODE solver to estimate the next point x_prev on the + # augmented PF-ODE trajectory (solving backward in time) + # Note that the DDIM step depends on both the predicted x_0 and source noise eps_0. x_prev = solver.ddim_step(pred_x0, pred_noise, index) - # 20.4.12. Get target LCM prediction on x_prev, w, c, t_n + # 9. 
Get target LCM prediction on x_prev, w, c, t_n (timesteps) with torch.no_grad(): with torch.autocast("cuda", dtype=weight_dtype): target_noise_pred = target_unet( @@ -1224,7 +1274,7 @@ def compute_embeddings(prompt_batch, proportion_empty_prompts, text_encoder, tok timestep_cond=w_embedding, encoder_hidden_states=prompt_embeds.float(), ).sample - pred_x_0 = predicted_origin( + pred_x_0 = get_predicted_original_sample( target_noise_pred, timesteps, x_prev, @@ -1234,7 +1284,7 @@ def compute_embeddings(prompt_batch, proportion_empty_prompts, text_encoder, tok ) target = c_skip * x_prev + c_out * pred_x_0 - # 20.4.13. Calculate loss + # 10. Calculate loss if args.loss_type == "l2": loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") elif args.loss_type == "huber": @@ -1242,7 +1292,7 @@ def compute_embeddings(prompt_batch, proportion_empty_prompts, text_encoder, tok torch.sqrt((model_pred.float() - target.float()) ** 2 + args.huber_c**2) - args.huber_c ) - # 20.4.14. Backpropagate on the online student model (`unet`) + # 11. Backpropagate on the online student model (`unet`) accelerator.backward(loss) if accelerator.sync_gradients: accelerator.clip_grad_norm_(unet.parameters(), args.max_grad_norm) @@ -1252,7 +1302,7 @@ def compute_embeddings(prompt_batch, proportion_empty_prompts, text_encoder, tok # Checks if the accelerator has performed an optimization step behind the scenes if accelerator.sync_gradients: - # 20.4.15. Make EMA update to target student model parameters + # 12. Make EMA update to target student model parameters (`target_unet`) update_ema(target_unet.parameters(), unet.parameters(), args.ema_decay) progress_bar.update(1) global_step += 1 diff --git a/examples/consistency_distillation/train_lcm_distill_sdxl_wds.py b/examples/consistency_distillation/train_lcm_distill_sdxl_wds.py index 952bec67d148..e58db46c9811 100644 --- a/examples/consistency_distillation/train_lcm_distill_sdxl_wds.py +++ b/examples/consistency_distillation/train_lcm_distill_sdxl_wds.py @@ -144,7 +144,7 @@ def __call__(self, x): return False -class Text2ImageDataset: +class SDXLText2ImageDataset: def __init__( self, train_shards_path_or_url: Union[str, List[str]], @@ -324,19 +324,43 @@ def scalings_for_boundary_conditions(timestep, sigma_data=0.5, timestep_scaling= # Compare LCMScheduler.step, Step 4 -def predicted_origin(model_output, timesteps, sample, prediction_type, alphas, sigmas): +def get_predicted_original_sample(model_output, timesteps, sample, prediction_type, alphas, sigmas): + alphas = extract_into_tensor(alphas, timesteps, sample.shape) + sigmas = extract_into_tensor(sigmas, timesteps, sample.shape) if prediction_type == "epsilon": - sigmas = extract_into_tensor(sigmas, timesteps, sample.shape) - alphas = extract_into_tensor(alphas, timesteps, sample.shape) pred_x_0 = (sample - sigmas * model_output) / alphas + elif prediction_type == "sample": + pred_x_0 = model_output elif prediction_type == "v_prediction": - pred_x_0 = alphas[timesteps] * sample - sigmas[timesteps] * model_output + pred_x_0 = alphas * sample - sigmas * model_output else: - raise ValueError(f"Prediction type {prediction_type} currently not supported.") + raise ValueError( + f"Prediction type {prediction_type} is not supported; currently, `epsilon`, `sample`, and `v_prediction`" + f" are supported." 
+ ) return pred_x_0 +# Based on step 4 in DDIMScheduler.step +def get_predicted_noise(model_output, timesteps, sample, prediction_type, alphas, sigmas): + alphas = extract_into_tensor(alphas, timesteps, sample.shape) + sigmas = extract_into_tensor(sigmas, timesteps, sample.shape) + if prediction_type == "epsilon": + pred_epsilon = model_output + elif prediction_type == "sample": + pred_epsilon = (sample - alphas * model_output) / sigmas + elif prediction_type == "v_prediction": + pred_epsilon = alphas * model_output + sigmas * sample + else: + raise ValueError( + f"Prediction type {prediction_type} is not supported; currently, `epsilon`, `sample`, and `v_prediction`" + f" are supported." + ) + + return pred_epsilon + + def extract_into_tensor(a, t, x_shape): b, *_ = t.shape out = a.gather(-1, t) @@ -863,9 +887,10 @@ def main(args): args.pretrained_teacher_model, subfolder="scheduler", revision=args.teacher_revision ) - # The scheduler calculates the alpha and sigma schedule for us + # DDPMScheduler calculates the alpha and sigma noise schedules (based on the alpha bars) for us alpha_schedule = torch.sqrt(noise_scheduler.alphas_cumprod) sigma_schedule = torch.sqrt(1 - noise_scheduler.alphas_cumprod) + # Initialize the DDIM ODE solver for distillation. solver = DDIMSolver( noise_scheduler.alphas_cumprod.numpy(), timesteps=noise_scheduler.config.num_train_timesteps, @@ -919,7 +944,7 @@ def main(args): text_encoder_two.requires_grad_(False) teacher_unet.requires_grad_(False) - # 8. Create online (`unet`) student U-Nets. This will be updated by the optimizer (e.g. via backpropagation.) + # 7. Create online student U-Net. This will be updated by the optimizer (e.g. via backpropagation.) # Add `time_cond_proj_dim` to the student U-Net if `teacher_unet.config.time_cond_proj_dim` is None if teacher_unet.config.time_cond_proj_dim is None: teacher_unet.config["time_cond_proj_dim"] = args.unet_time_cond_proj_dim @@ -928,8 +953,8 @@ def main(args): unet.load_state_dict(teacher_unet.state_dict(), strict=False) unet.train() - # 9. Create target (`ema_unet`) student U-Net parameters. This will be updated via EMA updates (polyak averaging). - # Initialize from unet + # 8. Create target student U-Net. This will be updated via EMA updates (polyak averaging). + # Initialize from (online) unet target_unet = UNet2DConditionModel(**teacher_unet.config) target_unet.load_state_dict(unet.state_dict()) target_unet.train() @@ -971,6 +996,7 @@ def main(args): # Also move the alpha and sigma noise schedules to accelerator.device. alpha_schedule = alpha_schedule.to(accelerator.device) sigma_schedule = sigma_schedule.to(accelerator.device) + # Move the ODE solver to accelerator.device. solver = solver.to(accelerator.device) # 10. Handle saving and loading of checkpoints @@ -1084,7 +1110,7 @@ def compute_embeddings( return {"prompt_embeds": prompt_embeds, **unet_added_cond_kwargs} - dataset = Text2ImageDataset( + dataset = SDXLText2ImageDataset( train_shards_path_or_url=args.train_shards_path_or_url, num_train_examples=args.max_train_samples, per_gpu_batch_size=args.train_batch_size, @@ -1202,6 +1228,7 @@ def compute_embeddings( for epoch in range(first_epoch, args.num_train_epochs): for step, batch in enumerate(train_dataloader): with accelerator.accumulate(unet): + # 1. 
Load and process the image, text, and micro-conditioning (original image size, crop coordinates) image, text, orig_size, crop_coords = batch image = image.to(accelerator.device, non_blocking=True) @@ -1223,38 +1250,39 @@ def compute_embeddings( latents = latents * vae.config.scaling_factor if args.pretrained_vae_model_name_or_path is None: latents = latents.to(weight_dtype) - - # Sample noise that we'll add to the latents - noise = torch.randn_like(latents) bsz = latents.shape[0] - # Sample a random timestep for each image t_n ~ U[0, N - k - 1] without bias. + # 2. Sample a random timestep for each image t_n from the ODE solver timesteps without bias. + # For the DDIM solver, the timestep schedule is [T - 1, T - k - 1, T - 2 * k - 1, ...] topk = noise_scheduler.config.num_train_timesteps // args.num_ddim_timesteps index = torch.randint(0, args.num_ddim_timesteps, (bsz,), device=latents.device).long() start_timesteps = solver.ddim_timesteps[index] timesteps = start_timesteps - topk timesteps = torch.where(timesteps < 0, torch.zeros_like(timesteps), timesteps) - # 20.4.4. Get boundary scalings for start_timesteps and (end) timesteps. + # 3. Get boundary scalings for start_timesteps and (end) timesteps. c_skip_start, c_out_start = scalings_for_boundary_conditions(start_timesteps) c_skip_start, c_out_start = [append_dims(x, latents.ndim) for x in [c_skip_start, c_out_start]] c_skip, c_out = scalings_for_boundary_conditions(timesteps) c_skip, c_out = [append_dims(x, latents.ndim) for x in [c_skip, c_out]] - # 20.4.5. Add noise to the latents according to the noise magnitude at each timestep - # (this is the forward diffusion process) [z_{t_{n + k}} in Algorithm 1] + # 4. Sample noise from the prior and add it to the latents according to the noise magnitude at each + # timestep (this is the forward diffusion process) [z_{t_{n + k}} in Algorithm 1] + noise = torch.randn_like(latents) noisy_model_input = noise_scheduler.add_noise(latents, noise, start_timesteps) - # 20.4.6. Sample a random guidance scale w from U[w_min, w_max] and embed it + # 5. Sample a random guidance scale w from U[w_min, w_max] and embed it w = (args.w_max - args.w_min) * torch.rand((bsz,)) + args.w_min w_embedding = guidance_scale_embedding(w, embedding_dim=unet.config.time_cond_proj_dim) w = w.reshape(bsz, 1, 1, 1) + # Move to U-Net device and dtype w = w.to(device=latents.device, dtype=latents.dtype) + w_embedding = w_embedding.to(device=latents.device, dtype=latents.dtype) - # 20.4.8. Prepare prompt embeds and unet_added_conditions + # 6. Prepare prompt embeds and unet_added_conditions prompt_embeds = encoded_text.pop("prompt_embeds") - # 20.4.9. Get online LCM prediction on z_{t_{n + k}}, w, c, t_{n + k} + # 7. Get online LCM prediction on z_{t_{n + k}} (noisy_model_input), w, c, t_{n + k} (start_timesteps) noise_pred = unet( noisy_model_input, start_timesteps, @@ -1263,7 +1291,7 @@ def compute_embeddings( added_cond_kwargs=encoded_text, ).sample - pred_x_0 = predicted_origin( + pred_x_0 = get_predicted_original_sample( noise_pred, start_timesteps, noisy_model_input, @@ -1274,18 +1302,28 @@ def compute_embeddings( model_pred = c_skip_start * noisy_model_input + c_out_start * pred_x_0 - # 20.4.10. Use the ODE solver to predict the kth step in the augmented PF-ODE trajectory after - # noisy_latents with both the conditioning embedding c and unconditional embedding 0 - # Get teacher model prediction on noisy_latents and conditional embedding + # 8. 
Compute the conditional and unconditional teacher model predictions to get CFG estimates of the + # predicted noise eps_0 and predicted original sample x_0, then run the ODE solver using these + # estimates to predict the data point in the augmented PF-ODE trajectory corresponding to the next ODE + # solver timestep. with torch.no_grad(): with torch.autocast("cuda"): + # 1. Get teacher model prediction on noisy_model_input z_{t_{n + k}} and conditional embedding c cond_teacher_output = teacher_unet( noisy_model_input.to(weight_dtype), start_timesteps, encoder_hidden_states=prompt_embeds.to(weight_dtype), added_cond_kwargs={k: v.to(weight_dtype) for k, v in encoded_text.items()}, ).sample - cond_pred_x0 = predicted_origin( + cond_pred_x0 = get_predicted_original_sample( + cond_teacher_output, + start_timesteps, + noisy_model_input, + noise_scheduler.config.prediction_type, + alpha_schedule, + sigma_schedule, + ) + cond_pred_noise = get_predicted_noise( cond_teacher_output, start_timesteps, noisy_model_input, @@ -1294,7 +1332,7 @@ def compute_embeddings( sigma_schedule, ) - # Get teacher model prediction on noisy_latents and unconditional embedding + # 2. Get teacher model prediction on noisy_model_input z_{t_{n + k}} and unconditional embedding 0 uncond_added_conditions = copy.deepcopy(encoded_text) uncond_added_conditions["text_embeds"] = uncond_pooled_prompt_embeds uncond_teacher_output = teacher_unet( @@ -1303,7 +1341,15 @@ def compute_embeddings( encoder_hidden_states=uncond_prompt_embeds.to(weight_dtype), added_cond_kwargs={k: v.to(weight_dtype) for k, v in uncond_added_conditions.items()}, ).sample - uncond_pred_x0 = predicted_origin( + uncond_pred_x0 = get_predicted_original_sample( + uncond_teacher_output, + start_timesteps, + noisy_model_input, + noise_scheduler.config.prediction_type, + alpha_schedule, + sigma_schedule, + ) + uncond_pred_noise = get_predicted_noise( uncond_teacher_output, start_timesteps, noisy_model_input, @@ -1312,12 +1358,16 @@ def compute_embeddings( sigma_schedule, ) - # 20.4.11. Perform "CFG" to get x_prev estimate (using the LCM paper's CFG formulation) + # 3. Calculate the CFG estimate of x_0 (pred_x0) and eps_0 (pred_noise) + # Note that this uses the LCM paper's CFG formulation rather than the Imagen CFG formulation pred_x0 = cond_pred_x0 + w * (cond_pred_x0 - uncond_pred_x0) - pred_noise = cond_teacher_output + w * (cond_teacher_output - uncond_teacher_output) + pred_noise = cond_pred_noise + w * (cond_pred_noise - uncond_pred_noise) + # 4. Run one step of the ODE solver to estimate the next point x_prev on the + # augmented PF-ODE trajectory (solving backward in time) + # Note that the DDIM step depends on both the predicted x_0 and source noise eps_0. x_prev = solver.ddim_step(pred_x0, pred_noise, index) - # 20.4.12. Get target LCM prediction on x_prev, w, c, t_n + # 9. Get target LCM prediction on x_prev, w, c, t_n (timesteps) with torch.no_grad(): with torch.autocast("cuda", dtype=weight_dtype): target_noise_pred = target_unet( @@ -1327,7 +1377,7 @@ def compute_embeddings( encoder_hidden_states=prompt_embeds.float(), added_cond_kwargs=encoded_text, ).sample - pred_x_0 = predicted_origin( + pred_x_0 = get_predicted_original_sample( target_noise_pred, timesteps, x_prev, @@ -1337,7 +1387,7 @@ def compute_embeddings( ) target = c_skip * x_prev + c_out * pred_x_0 - # 20.4.13. Calculate loss + # 10. 
Calculate loss if args.loss_type == "l2": loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") elif args.loss_type == "huber": @@ -1345,7 +1395,7 @@ def compute_embeddings( torch.sqrt((model_pred.float() - target.float()) ** 2 + args.huber_c**2) - args.huber_c ) - # 20.4.14. Backpropagate on the online student model (`unet`) + # 11. Backpropagate on the online student model (`unet`) accelerator.backward(loss) if accelerator.sync_gradients: accelerator.clip_grad_norm_(unet.parameters(), args.max_grad_norm) @@ -1355,7 +1405,7 @@ def compute_embeddings( # Checks if the accelerator has performed an optimization step behind the scenes if accelerator.sync_gradients: - # 20.4.15. Make EMA update to target student model parameters + # 12. Make EMA update to target student model parameters (`target_unet`) update_ema(target_unet.parameters(), unet.parameters(), args.ema_decay) progress_bar.update(1) global_step += 1 From d704a730cd6d2961d9ec17d7a041c4e9f7e13812 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Fri, 15 Dec 2023 18:34:46 +0530 Subject: [PATCH 11/30] Compile test fix (#6104) * update * update --- src/diffusers/utils/testing_utils.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index 14b89c3cd3b9..606980f8a3c5 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -820,7 +820,9 @@ def _is_torch_fp16_available(device): try: x = torch.zeros((2, 2), dtype=torch.float16).to(device) - _ = x @ x + _ = torch.mul(x, x) + return True + except Exception as e: if device.type == "cuda": raise ValueError( @@ -838,7 +840,9 @@ def _is_torch_fp64_available(device): try: x = torch.zeros((2, 2), dtype=torch.float64).to(device) - _ = x @ x + _ = torch.mul(x, x) + return True + except Exception as e: if device.type == "cuda": raise ValueError( From a81334e3f01550f79417af81caa92a9da58f135b Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Sat, 16 Dec 2023 08:36:08 +0530 Subject: [PATCH 12/30] [LoRA] add an error message when dealing with _best_guess_weight_name ofline (#6184) * add an error message when dealing with _best_guess_weight_name ofline * simplify condition --- src/diffusers/loaders/lora.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/diffusers/loaders/lora.py b/src/diffusers/loaders/lora.py index 3955fc2a1395..c1c3a260ec11 100644 --- a/src/diffusers/loaders/lora.py +++ b/src/diffusers/loaders/lora.py @@ -18,6 +18,7 @@ import safetensors import torch from huggingface_hub import model_info +from huggingface_hub.constants import HF_HUB_OFFLINE from huggingface_hub.utils import validate_hf_hub_args from packaging import version from torch import nn @@ -229,7 +230,9 @@ def lora_state_dict( # determine `weight_name`. 
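# Illustrative usage sketch (not part of the patch above or below): once this offline
# guard is in place, guessing the LoRA weight file name requires a Hub query, so callers
# working offline are expected to pass an explicit `weight_name`. The model id, local
# directory, and file name here are placeholder assumptions, not values from the patch.
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
pipe.load_lora_weights(
    "path/to/local/lora_dir",                        # local folder containing the LoRA weights (placeholder)
    weight_name="pytorch_lora_weights.safetensors",  # explicit file name, so no Hub lookup is needed
    local_files_only=True,
)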
if weight_name is None: weight_name = cls._best_guess_weight_name( - pretrained_model_name_or_path_or_dict, file_extension=".safetensors" + pretrained_model_name_or_path_or_dict, + file_extension=".safetensors", + local_files_only=local_files_only, ) model_file = _get_model_file( pretrained_model_name_or_path_or_dict, @@ -255,7 +258,7 @@ def lora_state_dict( if model_file is None: if weight_name is None: weight_name = cls._best_guess_weight_name( - pretrained_model_name_or_path_or_dict, file_extension=".bin" + pretrained_model_name_or_path_or_dict, file_extension=".bin", local_files_only=local_files_only ) model_file = _get_model_file( pretrained_model_name_or_path_or_dict, @@ -294,7 +297,12 @@ def lora_state_dict( return state_dict, network_alphas @classmethod - def _best_guess_weight_name(cls, pretrained_model_name_or_path_or_dict, file_extension=".safetensors"): + def _best_guess_weight_name( + cls, pretrained_model_name_or_path_or_dict, file_extension=".safetensors", local_files_only=False + ): + if local_files_only or HF_HUB_OFFLINE: + raise ValueError("When using the offline mode, you must specify a `weight_name`.") + targeted_files = [] if os.path.isfile(pretrained_model_name_or_path_or_dict): From 2d94c7838e273c40920ffd6d24d724357add7f2d Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Sat, 16 Dec 2023 08:45:54 +0530 Subject: [PATCH 13/30] [Core] feat: enable fused attention projections for other SD and SDXL pipelines (#6179) * feat: enable fused attention projections for other SD and SDXL pipelines * add: test for SD fused projections. --- .../alt_diffusion/pipeline_alt_diffusion.py | 60 ++++++++++++++++++ .../pipeline_alt_diffusion_img2img.py | 60 ++++++++++++++++++ .../pipeline_stable_diffusion.py | 62 +++++++++++++++++++ .../pipeline_stable_diffusion_img2img.py | 62 +++++++++++++++++++ .../pipeline_stable_diffusion_inpaint.py | 62 +++++++++++++++++++ .../pipeline_stable_diffusion_xl_img2img.py | 62 +++++++++++++++++++ .../pipeline_stable_diffusion_xl_inpaint.py | 62 +++++++++++++++++++ .../stable_diffusion/test_stable_diffusion.py | 31 ++++++++++ 8 files changed, 461 insertions(+) diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py index 2121e9b81509..dfeddab6dced 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py @@ -23,6 +23,7 @@ from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel +from ...models.attention_processor import FusedAttnProcessor2_0 from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( @@ -655,6 +656,65 @@ def disable_freeu(self): """Disables the FreeU mechanism if enabled.""" self.unet.disable_freeu() + def fuse_qkv_projections(self, unet: bool = True, vae: bool = True): + """ + Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, + key, value) are fused. For cross-attention modules, key and value projection matrices are fused. + + + + This API is 🧪 experimental. + + + + Args: + unet (`bool`, defaults to `True`): To apply fusion on the UNet. + vae (`bool`, defaults to `True`): To apply fusion on the VAE. 
+ """ + self.fusing_unet = False + self.fusing_vae = False + + if unet: + self.fusing_unet = True + self.unet.fuse_qkv_projections() + self.unet.set_attn_processor(FusedAttnProcessor2_0()) + + if vae: + if not isinstance(self.vae, AutoencoderKL): + raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.") + + self.fusing_vae = True + self.vae.fuse_qkv_projections() + self.vae.set_attn_processor(FusedAttnProcessor2_0()) + + def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True): + """Disable QKV projection fusion if enabled. + + + + This API is 🧪 experimental. + + + + Args: + unet (`bool`, defaults to `True`): To apply fusion on the UNet. + vae (`bool`, defaults to `True`): To apply fusion on the VAE. + + """ + if unet: + if not self.fusing_unet: + logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.") + else: + self.unet.unfuse_qkv_projections() + self.fusing_unet = False + + if vae: + if not self.fusing_vae: + logger.warning("The VAE was not initially fused for QKV projections. Doing nothing.") + else: + self.vae.unfuse_qkv_projections() + self.fusing_vae = False + def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index 401e6aef82b1..d87a9eaa1e8d 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -25,6 +25,7 @@ from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel +from ...models.attention_processor import FusedAttnProcessor2_0 from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( @@ -715,6 +716,65 @@ def disable_freeu(self): """Disables the FreeU mechanism if enabled.""" self.unet.disable_freeu() + def fuse_qkv_projections(self, unet: bool = True, vae: bool = True): + """ + Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, + key, value) are fused. For cross-attention modules, key and value projection matrices are fused. + + + + This API is 🧪 experimental. + + + + Args: + unet (`bool`, defaults to `True`): To apply fusion on the UNet. + vae (`bool`, defaults to `True`): To apply fusion on the VAE. + """ + self.fusing_unet = False + self.fusing_vae = False + + if unet: + self.fusing_unet = True + self.unet.fuse_qkv_projections() + self.unet.set_attn_processor(FusedAttnProcessor2_0()) + + if vae: + if not isinstance(self.vae, AutoencoderKL): + raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.") + + self.fusing_vae = True + self.vae.fuse_qkv_projections() + self.vae.set_attn_processor(FusedAttnProcessor2_0()) + + def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True): + """Disable QKV projection fusion if enabled. + + + + This API is 🧪 experimental. + + + + Args: + unet (`bool`, defaults to `True`): To apply fusion on the UNet. + vae (`bool`, defaults to `True`): To apply fusion on the VAE. 
+ + """ + if unet: + if not self.fusing_unet: + logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.") + else: + self.unet.unfuse_qkv_projections() + self.fusing_unet = False + + if vae: + if not self.fusing_vae: + logger.warning("The VAE was not initially fused for QKV projections. Doing nothing.") + else: + self.vae.unfuse_qkv_projections() + self.fusing_vae = False + def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index f7f4a16f0aa4..2ad90f049922 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -23,6 +23,7 @@ from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel +from ...models.attention_processor import FusedAttnProcessor2_0 from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( @@ -650,6 +651,67 @@ def disable_freeu(self): """Disables the FreeU mechanism if enabled.""" self.unet.disable_freeu() + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.fuse_qkv_projections + def fuse_qkv_projections(self, unet: bool = True, vae: bool = True): + """ + Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, + key, value) are fused. For cross-attention modules, key and value projection matrices are fused. + + + + This API is 🧪 experimental. + + + + Args: + unet (`bool`, defaults to `True`): To apply fusion on the UNet. + vae (`bool`, defaults to `True`): To apply fusion on the VAE. + """ + self.fusing_unet = False + self.fusing_vae = False + + if unet: + self.fusing_unet = True + self.unet.fuse_qkv_projections() + self.unet.set_attn_processor(FusedAttnProcessor2_0()) + + if vae: + if not isinstance(self.vae, AutoencoderKL): + raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.") + + self.fusing_vae = True + self.vae.fuse_qkv_projections() + self.vae.set_attn_processor(FusedAttnProcessor2_0()) + + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.unfuse_qkv_projections + def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True): + """Disable QKV projection fusion if enabled. + + + + This API is 🧪 experimental. + + + + Args: + unet (`bool`, defaults to `True`): To apply fusion on the UNet. + vae (`bool`, defaults to `True`): To apply fusion on the VAE. + + """ + if unet: + if not self.fusing_unet: + logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.") + else: + self.unet.unfuse_qkv_projections() + self.fusing_unet = False + + if vae: + if not self.fusing_vae: + logger.warning("The VAE was not initially fused for QKV projections. 
Doing nothing.") + else: + self.vae.unfuse_qkv_projections() + self.fusing_vae = False + # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): """ diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index c80178152a6e..d7e0952b2aa4 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -25,6 +25,7 @@ from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel +from ...models.attention_processor import FusedAttnProcessor2_0 from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( @@ -718,6 +719,67 @@ def disable_freeu(self): """Disables the FreeU mechanism if enabled.""" self.unet.disable_freeu() + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.fuse_qkv_projections + def fuse_qkv_projections(self, unet: bool = True, vae: bool = True): + """ + Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, + key, value) are fused. For cross-attention modules, key and value projection matrices are fused. + + + + This API is 🧪 experimental. + + + + Args: + unet (`bool`, defaults to `True`): To apply fusion on the UNet. + vae (`bool`, defaults to `True`): To apply fusion on the VAE. + """ + self.fusing_unet = False + self.fusing_vae = False + + if unet: + self.fusing_unet = True + self.unet.fuse_qkv_projections() + self.unet.set_attn_processor(FusedAttnProcessor2_0()) + + if vae: + if not isinstance(self.vae, AutoencoderKL): + raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.") + + self.fusing_vae = True + self.vae.fuse_qkv_projections() + self.vae.set_attn_processor(FusedAttnProcessor2_0()) + + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.unfuse_qkv_projections + def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True): + """Disable QKV projection fusion if enabled. + + + + This API is 🧪 experimental. + + + + Args: + unet (`bool`, defaults to `True`): To apply fusion on the UNet. + vae (`bool`, defaults to `True`): To apply fusion on the VAE. + + """ + if unet: + if not self.fusing_unet: + logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.") + else: + self.unet.unfuse_qkv_projections() + self.fusing_unet = False + + if vae: + if not self.fusing_vae: + logger.warning("The VAE was not initially fused for QKV projections. 
Doing nothing.") + else: + self.vae.unfuse_qkv_projections() + self.fusing_vae = False + # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): """ diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index 820c2eecb864..a321bb41a7eb 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -25,6 +25,7 @@ from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AsymmetricAutoencoderKL, AutoencoderKL, ImageProjection, UNet2DConditionModel +from ...models.attention_processor import FusedAttnProcessor2_0 from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers from ...utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers @@ -844,6 +845,67 @@ def disable_freeu(self): """Disables the FreeU mechanism if enabled.""" self.unet.disable_freeu() + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.fuse_qkv_projections + def fuse_qkv_projections(self, unet: bool = True, vae: bool = True): + """ + Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, + key, value) are fused. For cross-attention modules, key and value projection matrices are fused. + + + + This API is 🧪 experimental. + + + + Args: + unet (`bool`, defaults to `True`): To apply fusion on the UNet. + vae (`bool`, defaults to `True`): To apply fusion on the VAE. + """ + self.fusing_unet = False + self.fusing_vae = False + + if unet: + self.fusing_unet = True + self.unet.fuse_qkv_projections() + self.unet.set_attn_processor(FusedAttnProcessor2_0()) + + if vae: + if not isinstance(self.vae, AutoencoderKL): + raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.") + + self.fusing_vae = True + self.vae.fuse_qkv_projections() + self.vae.set_attn_processor(FusedAttnProcessor2_0()) + + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.unfuse_qkv_projections + def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True): + """Disable QKV projection fusion if enabled. + + + + This API is 🧪 experimental. + + + + Args: + unet (`bool`, defaults to `True`): To apply fusion on the UNet. + vae (`bool`, defaults to `True`): To apply fusion on the VAE. + + """ + if unet: + if not self.fusing_unet: + logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.") + else: + self.unet.unfuse_qkv_projections() + self.fusing_unet = False + + if vae: + if not self.fusing_vae: + logger.warning("The VAE was not initially fused for QKV projections. 
Doing nothing.") + else: + self.vae.unfuse_qkv_projections() + self.fusing_vae = False + # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): """ diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index 23d8f97ffb52..97f99386acef 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -35,6 +35,7 @@ from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel from ...models.attention_processor import ( AttnProcessor2_0, + FusedAttnProcessor2_0, LoRAAttnProcessor2_0, LoRAXFormersAttnProcessor, XFormersAttnProcessor, @@ -864,6 +865,67 @@ def disable_freeu(self): """Disables the FreeU mechanism if enabled.""" self.unet.disable_freeu() + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.fuse_qkv_projections + def fuse_qkv_projections(self, unet: bool = True, vae: bool = True): + """ + Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, + key, value) are fused. For cross-attention modules, key and value projection matrices are fused. + + + + This API is 🧪 experimental. + + + + Args: + unet (`bool`, defaults to `True`): To apply fusion on the UNet. + vae (`bool`, defaults to `True`): To apply fusion on the VAE. + """ + self.fusing_unet = False + self.fusing_vae = False + + if unet: + self.fusing_unet = True + self.unet.fuse_qkv_projections() + self.unet.set_attn_processor(FusedAttnProcessor2_0()) + + if vae: + if not isinstance(self.vae, AutoencoderKL): + raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.") + + self.fusing_vae = True + self.vae.fuse_qkv_projections() + self.vae.set_attn_processor(FusedAttnProcessor2_0()) + + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.unfuse_qkv_projections + def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True): + """Disable QKV projection fusion if enabled. + + + + This API is 🧪 experimental. + + + + Args: + unet (`bool`, defaults to `True`): To apply fusion on the UNet. + vae (`bool`, defaults to `True`): To apply fusion on the VAE. + + """ + if unet: + if not self.fusing_unet: + logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.") + else: + self.unet.unfuse_qkv_projections() + self.fusing_unet = False + + if vae: + if not self.fusing_vae: + logger.warning("The VAE was not initially fused for QKV projections. 
Doing nothing.") + else: + self.vae.unfuse_qkv_projections() + self.fusing_vae = False + # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): """ diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py index b714bfa4bd12..812f5499f8e6 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py @@ -36,6 +36,7 @@ from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel from ...models.attention_processor import ( AttnProcessor2_0, + FusedAttnProcessor2_0, LoRAAttnProcessor2_0, LoRAXFormersAttnProcessor, XFormersAttnProcessor, @@ -1084,6 +1085,67 @@ def disable_freeu(self): """Disables the FreeU mechanism if enabled.""" self.unet.disable_freeu() + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.fuse_qkv_projections + def fuse_qkv_projections(self, unet: bool = True, vae: bool = True): + """ + Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, + key, value) are fused. For cross-attention modules, key and value projection matrices are fused. + + + + This API is 🧪 experimental. + + + + Args: + unet (`bool`, defaults to `True`): To apply fusion on the UNet. + vae (`bool`, defaults to `True`): To apply fusion on the VAE. + """ + self.fusing_unet = False + self.fusing_vae = False + + if unet: + self.fusing_unet = True + self.unet.fuse_qkv_projections() + self.unet.set_attn_processor(FusedAttnProcessor2_0()) + + if vae: + if not isinstance(self.vae, AutoencoderKL): + raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.") + + self.fusing_vae = True + self.vae.fuse_qkv_projections() + self.vae.set_attn_processor(FusedAttnProcessor2_0()) + + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.unfuse_qkv_projections + def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True): + """Disable QKV projection fusion if enabled. + + + + This API is 🧪 experimental. + + + + Args: + unet (`bool`, defaults to `True`): To apply fusion on the UNet. + vae (`bool`, defaults to `True`): To apply fusion on the VAE. + + """ + if unet: + if not self.fusing_unet: + logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.") + else: + self.unet.unfuse_qkv_projections() + self.fusing_unet = False + + if vae: + if not self.fusing_vae: + logger.warning("The VAE was not initially fused for QKV projections. 
Doing nothing.") + else: + self.vae.unfuse_qkv_projections() + self.fusing_vae = False + # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): """ diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py index 28d0d07e6948..ac105d22fa82 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py @@ -661,6 +661,37 @@ def test_freeu_disabled(self): output[0, -3:, -3:, -1], output_no_freeu[0, -3:, -3:, -1] ), "Disabling of FreeU should lead to results similar to the default pipeline results." + def test_fused_qkv_projections(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + sd_pipe = StableDiffusionPipeline(**components) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + image = sd_pipe(**inputs).images + original_image_slice = image[0, -3:, -3:, -1] + + sd_pipe.fuse_qkv_projections() + inputs = self.get_dummy_inputs(device) + image = sd_pipe(**inputs).images + image_slice_fused = image[0, -3:, -3:, -1] + + sd_pipe.unfuse_qkv_projections() + inputs = self.get_dummy_inputs(device) + image = sd_pipe(**inputs).images + image_slice_disabled = image[0, -3:, -3:, -1] + + assert np.allclose( + original_image_slice, image_slice_fused, atol=1e-2, rtol=1e-2 + ), "Fusion of QKV projections shouldn't affect the outputs." + assert np.allclose( + image_slice_fused, image_slice_disabled, atol=1e-2, rtol=1e-2 + ), "Outputs, with QKV projection fusion enabled, shouldn't change when fused QKV projections are disabled." + assert np.allclose( + original_image_slice, image_slice_disabled, atol=1e-2, rtol=1e-2 + ), "Original outputs should match when fused QKV projections are disabled." + @slow @require_torch_gpu From 9cef07da5a1f28ddfeadd232ede9a815c7945aec Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Sun, 17 Dec 2023 15:32:11 +0530 Subject: [PATCH 14/30] [Benchmarks] fix: lcm benchmarking reporting (#6198) * fix: lcm benchmarking reporting * fix generate_csv_dict call. --- benchmarks/base_classes.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 5d328f62b904..38e7663fbf6e 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -162,6 +162,25 @@ def run_inference(self, pipe, args): guidance_scale=1.0, ) + def benchmark(self, args): + flush() + + print(f"[INFO] {self.pipe.__class__.__name__}: Running benchmark with: {vars(args)}\n") + + time = benchmark_fn(self.run_inference, self.pipe, args) # in seconds. + memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) # in GBs. 
+ benchmark_info = BenchmarkInfo(time=time, memory=memory) + + pipeline_class_name = str(self.pipe.__class__.__name__) + flush() + csv_dict = generate_csv_dict( + pipeline_cls=pipeline_class_name, ckpt=self.lora_id, args=args, benchmark_info=benchmark_info + ) + filepath = self.get_result_filepath(args) + write_to_csv(filepath, csv_dict) + print(f"Logs written to: {filepath}") + flush() + class ImageToImageBenchmark(TextToImageBenchmark): pipeline_class = AutoPipelineForImage2Image From 56b3b216936affc398220c8e8e49ceba5db7bf9a Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Mon, 18 Dec 2023 12:42:15 +0530 Subject: [PATCH 15/30] [Refactor autoencoders] feat: introduce autoencoders module (#6129) * feat: introduce autoencoders module * more changes for styling and copy fixing * path changes in the docs. * fix: import structure in init. * fix controlnetxs import --- .../en/api/models/asymmetricautoencoderkl.md | 6 ++-- docs/source/en/api/models/autoencoder_tiny.md | 2 +- docs/source/en/api/models/autoencoderkl.md | 4 +-- scripts/convert_consistency_decoder.py | 2 +- src/diffusers/models/__init__.py | 22 ++++++++------- src/diffusers/models/autoencoders/__init__.py | 5 ++++ .../{ => autoencoders}/autoencoder_asym_kl.py | 8 +++--- .../{ => autoencoders}/autoencoder_kl.py | 12 ++++---- .../autoencoder_kl_temporal_decoder.py | 16 +++++------ .../{ => autoencoders}/autoencoder_tiny.py | 8 +++--- .../consistency_decoder_vae.py | 28 +++++++++---------- .../models/{ => autoencoders}/vae.py | 10 +++---- src/diffusers/models/controlnetxs.py | 2 +- src/diffusers/models/vq_model.py | 2 +- .../wuerstchen/modeling_paella_vq_model.py | 2 +- 15 files changed, 68 insertions(+), 61 deletions(-) create mode 100644 src/diffusers/models/autoencoders/__init__.py rename src/diffusers/models/{ => autoencoders}/autoencoder_asym_kl.py (97%) rename src/diffusers/models/{ => autoencoders}/autoencoder_kl.py (98%) rename src/diffusers/models/{ => autoencoders}/autoencoder_kl_temporal_decoder.py (97%) rename src/diffusers/models/{ => autoencoders}/autoencoder_tiny.py (98%) rename src/diffusers/models/{ => autoencoders}/consistency_decoder_vae.py (95%) rename src/diffusers/models/{ => autoencoders}/vae.py (99%) diff --git a/docs/source/en/api/models/asymmetricautoencoderkl.md b/docs/source/en/api/models/asymmetricautoencoderkl.md index 1e102943c5e4..fdc71df7a999 100644 --- a/docs/source/en/api/models/asymmetricautoencoderkl.md +++ b/docs/source/en/api/models/asymmetricautoencoderkl.md @@ -49,12 +49,12 @@ make_image_grid([original_image, mask_image, image], rows=1, cols=3) ## AsymmetricAutoencoderKL -[[autodoc]] models.autoencoder_asym_kl.AsymmetricAutoencoderKL +[[autodoc]] models.autoencoders.autoencoder_asym_kl.AsymmetricAutoencoderKL ## AutoencoderKLOutput -[[autodoc]] models.autoencoder_kl.AutoencoderKLOutput +[[autodoc]] models.autoencoders.autoencoder_kl.AutoencoderKLOutput ## DecoderOutput -[[autodoc]] models.vae.DecoderOutput +[[autodoc]] models.autoencoders.vae.DecoderOutput diff --git a/docs/source/en/api/models/autoencoder_tiny.md b/docs/source/en/api/models/autoencoder_tiny.md index 1d19539bffe8..b5c9dc638e6f 100644 --- a/docs/source/en/api/models/autoencoder_tiny.md +++ b/docs/source/en/api/models/autoencoder_tiny.md @@ -54,4 +54,4 @@ image ## AutoencoderTinyOutput -[[autodoc]] models.autoencoder_tiny.AutoencoderTinyOutput +[[autodoc]] models.autoencoders.autoencoder_tiny.AutoencoderTinyOutput diff --git a/docs/source/en/api/models/autoencoderkl.md b/docs/source/en/api/models/autoencoderkl.md index 
f42a4d2941dd..72427ab30e6a 100644 --- a/docs/source/en/api/models/autoencoderkl.md +++ b/docs/source/en/api/models/autoencoderkl.md @@ -36,11 +36,11 @@ model = AutoencoderKL.from_single_file(url) ## AutoencoderKLOutput -[[autodoc]] models.autoencoder_kl.AutoencoderKLOutput +[[autodoc]] models.autoencoders.autoencoder_kl.AutoencoderKLOutput ## DecoderOutput -[[autodoc]] models.vae.DecoderOutput +[[autodoc]] models.autoencoders.vae.DecoderOutput ## FlaxAutoencoderKL diff --git a/scripts/convert_consistency_decoder.py b/scripts/convert_consistency_decoder.py index 6a294038a5a3..3319f4c4665e 100644 --- a/scripts/convert_consistency_decoder.py +++ b/scripts/convert_consistency_decoder.py @@ -12,9 +12,9 @@ from tqdm import tqdm from diffusers import AutoencoderKL, ConsistencyDecoderVAE, DiffusionPipeline, StableDiffusionPipeline, UNet2DModel +from diffusers.models.autoencoders.vae import Encoder from diffusers.models.embeddings import TimestepEmbedding from diffusers.models.unet_2d_blocks import ResnetDownsampleBlock2D, ResnetUpsampleBlock2D, UNetMidBlock2D -from diffusers.models.vae import Encoder args = ArgumentParser() diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py index ec1c7ab43494..7487bbf2f98e 100644 --- a/src/diffusers/models/__init__.py +++ b/src/diffusers/models/__init__.py @@ -26,11 +26,11 @@ if is_torch_available(): _import_structure["adapter"] = ["MultiAdapter", "T2IAdapter"] - _import_structure["autoencoder_asym_kl"] = ["AsymmetricAutoencoderKL"] - _import_structure["autoencoder_kl"] = ["AutoencoderKL"] - _import_structure["autoencoder_kl_temporal_decoder"] = ["AutoencoderKLTemporalDecoder"] - _import_structure["autoencoder_tiny"] = ["AutoencoderTiny"] - _import_structure["consistency_decoder_vae"] = ["ConsistencyDecoderVAE"] + _import_structure["autoencoders.autoencoder_asym_kl"] = ["AsymmetricAutoencoderKL"] + _import_structure["autoencoders.autoencoder_kl"] = ["AutoencoderKL"] + _import_structure["autoencoders.autoencoder_kl_temporal_decoder"] = ["AutoencoderKLTemporalDecoder"] + _import_structure["autoencoders.autoencoder_tiny"] = ["AutoencoderTiny"] + _import_structure["autoencoders.consistency_decoder_vae"] = ["ConsistencyDecoderVAE"] _import_structure["controlnet"] = ["ControlNetModel"] _import_structure["controlnetxs"] = ["ControlNetXSModel"] _import_structure["dual_transformer_2d"] = ["DualTransformer2DModel"] @@ -58,11 +58,13 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: if is_torch_available(): from .adapter import MultiAdapter, T2IAdapter - from .autoencoder_asym_kl import AsymmetricAutoencoderKL - from .autoencoder_kl import AutoencoderKL - from .autoencoder_kl_temporal_decoder import AutoencoderKLTemporalDecoder - from .autoencoder_tiny import AutoencoderTiny - from .consistency_decoder_vae import ConsistencyDecoderVAE + from .autoencoders import ( + AsymmetricAutoencoderKL, + AutoencoderKL, + AutoencoderKLTemporalDecoder, + AutoencoderTiny, + ConsistencyDecoderVAE, + ) from .controlnet import ControlNetModel from .controlnetxs import ControlNetXSModel from .dual_transformer_2d import DualTransformer2DModel diff --git a/src/diffusers/models/autoencoders/__init__.py b/src/diffusers/models/autoencoders/__init__.py new file mode 100644 index 000000000000..201a40ff17b2 --- /dev/null +++ b/src/diffusers/models/autoencoders/__init__.py @@ -0,0 +1,5 @@ +from .autoencoder_asym_kl import AsymmetricAutoencoderKL +from .autoencoder_kl import AutoencoderKL +from .autoencoder_kl_temporal_decoder import AutoencoderKLTemporalDecoder +from 
.autoencoder_tiny import AutoencoderTiny +from .consistency_decoder_vae import ConsistencyDecoderVAE diff --git a/src/diffusers/models/autoencoder_asym_kl.py b/src/diffusers/models/autoencoders/autoencoder_asym_kl.py similarity index 97% rename from src/diffusers/models/autoencoder_asym_kl.py rename to src/diffusers/models/autoencoders/autoencoder_asym_kl.py index 678e47234096..9114650619fc 100644 --- a/src/diffusers/models/autoencoder_asym_kl.py +++ b/src/diffusers/models/autoencoders/autoencoder_asym_kl.py @@ -16,10 +16,10 @@ import torch import torch.nn as nn -from ..configuration_utils import ConfigMixin, register_to_config -from ..utils.accelerate_utils import apply_forward_hook -from .modeling_outputs import AutoencoderKLOutput -from .modeling_utils import ModelMixin +from ...configuration_utils import ConfigMixin, register_to_config +from ...utils.accelerate_utils import apply_forward_hook +from ..modeling_outputs import AutoencoderKLOutput +from ..modeling_utils import ModelMixin from .vae import DecoderOutput, DiagonalGaussianDistribution, Encoder, MaskConditionDecoder diff --git a/src/diffusers/models/autoencoder_kl.py b/src/diffusers/models/autoencoders/autoencoder_kl.py similarity index 98% rename from src/diffusers/models/autoencoder_kl.py rename to src/diffusers/models/autoencoders/autoencoder_kl.py index 8fa3574125f9..ae2d90c548f8 100644 --- a/src/diffusers/models/autoencoder_kl.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl.py @@ -16,10 +16,10 @@ import torch import torch.nn as nn -from ..configuration_utils import ConfigMixin, register_to_config -from ..loaders import FromOriginalVAEMixin -from ..utils.accelerate_utils import apply_forward_hook -from .attention_processor import ( +from ...configuration_utils import ConfigMixin, register_to_config +from ...loaders import FromOriginalVAEMixin +from ...utils.accelerate_utils import apply_forward_hook +from ..attention_processor import ( ADDED_KV_ATTENTION_PROCESSORS, CROSS_ATTENTION_PROCESSORS, Attention, @@ -27,8 +27,8 @@ AttnAddedKVProcessor, AttnProcessor, ) -from .modeling_outputs import AutoencoderKLOutput -from .modeling_utils import ModelMixin +from ..modeling_outputs import AutoencoderKLOutput +from ..modeling_utils import ModelMixin from .vae import Decoder, DecoderOutput, DiagonalGaussianDistribution, Encoder diff --git a/src/diffusers/models/autoencoder_kl_temporal_decoder.py b/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py similarity index 97% rename from src/diffusers/models/autoencoder_kl_temporal_decoder.py rename to src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py index 176b6e0df924..0b7f8d1f5336 100644 --- a/src/diffusers/models/autoencoder_kl_temporal_decoder.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py @@ -16,14 +16,14 @@ import torch import torch.nn as nn -from ..configuration_utils import ConfigMixin, register_to_config -from ..loaders import FromOriginalVAEMixin -from ..utils import is_torch_version -from ..utils.accelerate_utils import apply_forward_hook -from .attention_processor import CROSS_ATTENTION_PROCESSORS, AttentionProcessor, AttnProcessor -from .modeling_outputs import AutoencoderKLOutput -from .modeling_utils import ModelMixin -from .unet_3d_blocks import MidBlockTemporalDecoder, UpBlockTemporalDecoder +from ...configuration_utils import ConfigMixin, register_to_config +from ...loaders import FromOriginalVAEMixin +from ...utils import is_torch_version +from ...utils.accelerate_utils import apply_forward_hook 
+from ..attention_processor import CROSS_ATTENTION_PROCESSORS, AttentionProcessor, AttnProcessor +from ..modeling_outputs import AutoencoderKLOutput +from ..modeling_utils import ModelMixin +from ..unet_3d_blocks import MidBlockTemporalDecoder, UpBlockTemporalDecoder from .vae import DecoderOutput, DiagonalGaussianDistribution, Encoder diff --git a/src/diffusers/models/autoencoder_tiny.py b/src/diffusers/models/autoencoders/autoencoder_tiny.py similarity index 98% rename from src/diffusers/models/autoencoder_tiny.py rename to src/diffusers/models/autoencoders/autoencoder_tiny.py index 56ccf30e0402..08b1c0e74d70 100644 --- a/src/diffusers/models/autoencoder_tiny.py +++ b/src/diffusers/models/autoencoders/autoencoder_tiny.py @@ -18,10 +18,10 @@ import torch -from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput -from ..utils.accelerate_utils import apply_forward_hook -from .modeling_utils import ModelMixin +from ...configuration_utils import ConfigMixin, register_to_config +from ...utils import BaseOutput +from ...utils.accelerate_utils import apply_forward_hook +from ..modeling_utils import ModelMixin from .vae import DecoderOutput, DecoderTiny, EncoderTiny diff --git a/src/diffusers/models/consistency_decoder_vae.py b/src/diffusers/models/autoencoders/consistency_decoder_vae.py similarity index 95% rename from src/diffusers/models/consistency_decoder_vae.py rename to src/diffusers/models/autoencoders/consistency_decoder_vae.py index 34176a35e835..d92423eafc31 100644 --- a/src/diffusers/models/consistency_decoder_vae.py +++ b/src/diffusers/models/autoencoders/consistency_decoder_vae.py @@ -18,20 +18,20 @@ import torch.nn.functional as F from torch import nn -from ..configuration_utils import ConfigMixin, register_to_config -from ..schedulers import ConsistencyDecoderScheduler -from ..utils import BaseOutput -from ..utils.accelerate_utils import apply_forward_hook -from ..utils.torch_utils import randn_tensor -from .attention_processor import ( +from ...configuration_utils import ConfigMixin, register_to_config +from ...schedulers import ConsistencyDecoderScheduler +from ...utils import BaseOutput +from ...utils.accelerate_utils import apply_forward_hook +from ...utils.torch_utils import randn_tensor +from ..attention_processor import ( ADDED_KV_ATTENTION_PROCESSORS, CROSS_ATTENTION_PROCESSORS, AttentionProcessor, AttnAddedKVProcessor, AttnProcessor, ) -from .modeling_utils import ModelMixin -from .unet_2d import UNet2DModel +from ..modeling_utils import ModelMixin +from ..unet_2d import UNet2DModel from .vae import DecoderOutput, DiagonalGaussianDistribution, Encoder @@ -153,7 +153,7 @@ def __init__( self.use_slicing = False self.use_tiling = False - # Copied from diffusers.models.autoencoder_kl.AutoencoderKL.enable_tiling + # Copied from diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL.enable_tiling def enable_tiling(self, use_tiling: bool = True): r""" Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to @@ -162,7 +162,7 @@ def enable_tiling(self, use_tiling: bool = True): """ self.use_tiling = use_tiling - # Copied from diffusers.models.autoencoder_kl.AutoencoderKL.disable_tiling + # Copied from diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL.disable_tiling def disable_tiling(self): r""" Disable tiled VAE decoding. 
If `enable_tiling` was previously enabled, this method will go back to computing @@ -170,7 +170,7 @@ def disable_tiling(self): """ self.enable_tiling(False) - # Copied from diffusers.models.autoencoder_kl.AutoencoderKL.enable_slicing + # Copied from diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL.enable_slicing def enable_slicing(self): r""" Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to @@ -178,7 +178,7 @@ def enable_slicing(self): """ self.use_slicing = True - # Copied from diffusers.models.autoencoder_kl.AutoencoderKL.disable_slicing + # Copied from diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL.disable_slicing def disable_slicing(self): r""" Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing @@ -333,14 +333,14 @@ def decode( return DecoderOutput(sample=x_0) - # Copied from diffusers.models.autoencoder_kl.AutoencoderKL.blend_v + # Copied from diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL.blend_v def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: blend_extent = min(a.shape[2], b.shape[2], blend_extent) for y in range(blend_extent): b[:, :, y, :] = a[:, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, y, :] * (y / blend_extent) return b - # Copied from diffusers.models.autoencoder_kl.AutoencoderKL.blend_h + # Copied from diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL.blend_h def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: blend_extent = min(a.shape[3], b.shape[3], blend_extent) for x in range(blend_extent): diff --git a/src/diffusers/models/vae.py b/src/diffusers/models/autoencoders/vae.py similarity index 99% rename from src/diffusers/models/vae.py rename to src/diffusers/models/autoencoders/vae.py index 0049456e2187..9ed0232e6983 100644 --- a/src/diffusers/models/vae.py +++ b/src/diffusers/models/autoencoders/vae.py @@ -18,11 +18,11 @@ import torch import torch.nn as nn -from ..utils import BaseOutput, is_torch_version -from ..utils.torch_utils import randn_tensor -from .activations import get_activation -from .attention_processor import SpatialNorm -from .unet_2d_blocks import ( +from ...utils import BaseOutput, is_torch_version +from ...utils.torch_utils import randn_tensor +from ..activations import get_activation +from ..attention_processor import SpatialNorm +from ..unet_2d_blocks import ( AutoencoderTinyBlock, UNetMidBlock2D, get_down_block, diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index 41f2d8af01b1..3cc77fe70d72 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -26,7 +26,7 @@ from .attention_processor import ( AttentionProcessor, ) -from .autoencoder_kl import AutoencoderKL +from .autoencoders import AutoencoderKL from .lora import LoRACompatibleConv from .modeling_utils import ModelMixin from .unet_2d_blocks import ( diff --git a/src/diffusers/models/vq_model.py b/src/diffusers/models/vq_model.py index f4a6c8fb227f..bfe62ec863b3 100644 --- a/src/diffusers/models/vq_model.py +++ b/src/diffusers/models/vq_model.py @@ -20,8 +20,8 @@ from ..configuration_utils import ConfigMixin, register_to_config from ..utils import BaseOutput from ..utils.accelerate_utils import apply_forward_hook +from .autoencoders.vae import Decoder, DecoderOutput, Encoder, VectorQuantizer from .modeling_utils import ModelMixin -from .vae import Decoder, 
DecoderOutput, Encoder, VectorQuantizer @dataclass diff --git a/src/diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py b/src/diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py index 7ee42faa0e82..3115cc2d9d3d 100644 --- a/src/diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +++ b/src/diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py @@ -19,8 +19,8 @@ import torch.nn as nn from ...configuration_utils import ConfigMixin, register_to_config +from ...models.autoencoders.vae import DecoderOutput, VectorQuantizer from ...models.modeling_utils import ModelMixin -from ...models.vae import DecoderOutput, VectorQuantizer from ...models.vq_model import VQEncoderOutput from ...utils.accelerate_utils import apply_forward_hook From 49644babd305c9d0898ecd97ce368800e8ad092b Mon Sep 17 00:00:00 2001 From: Yudong Jin Date: Mon, 18 Dec 2023 18:06:00 +0800 Subject: [PATCH 16/30] Fix the test script in examples/text_to_image/README.md (#6209) * Update examples/text_to_image/README.md * Update examples/text_to_image/README.md Co-authored-by: Sayak Paul --------- Co-authored-by: Sayak Paul --- examples/text_to_image/README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/text_to_image/README.md b/examples/text_to_image/README.md index 94fd63f33067..a56cccbcf5d7 100644 --- a/examples/text_to_image/README.md +++ b/examples/text_to_image/README.md @@ -101,8 +101,8 @@ accelerate launch --mixed_precision="fp16" train_text_to_image.py \ Once the training is finished the model will be saved in the `output_dir` specified in the command. In this example it's `sd-pokemon-model`. To load the fine-tuned model for inference just pass that path to `StableDiffusionPipeline` - ```python +import torch from diffusers import StableDiffusionPipeline model_path = "path_to_saved_model" @@ -114,12 +114,13 @@ image.save("yoda-pokemon.png") ``` Checkpoints only save the unet, so to run inference from a checkpoint, just load the unet + ```python +import torch from diffusers import StableDiffusionPipeline, UNet2DConditionModel model_path = "path_to_saved_model" - -unet = UNet2DConditionModel.from_pretrained(model_path + "/checkpoint-/unet") +unet = UNet2DConditionModel.from_pretrained(model_path + "/checkpoint-/unet", torch_dtype=torch.float16) pipe = StableDiffusionPipeline.from_pretrained("", unet=unet, torch_dtype=torch.float16) pipe.to("cuda") From 74558ff65b0ed169cc5f36b05d95b4cb320e907c Mon Sep 17 00:00:00 2001 From: Omar Sanseviero Date: Mon, 18 Dec 2023 11:06:16 +0100 Subject: [PATCH 17/30] Nit fix to training params (#6200) --- docs/source/en/training/unconditional_training.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/training/unconditional_training.md b/docs/source/en/training/unconditional_training.md index 97b644883cae..a124d5f32e1f 100644 --- a/docs/source/en/training/unconditional_training.md +++ b/docs/source/en/training/unconditional_training.md @@ -186,7 +186,7 @@ accelerate launch train_unconditional.py \ If you're training with more than one GPU, add the `--multi_gpu` parameter to the training command: ```bash -accelerate launch --mixed_precision="fp16" --multi_gpu train_unconditional.py \ +accelerate launch --multi_gpu train_unconditional.py \ --dataset_name="huggan/flowers-102-categories" \ --output_dir="ddpm-ema-flowers-64" \ --mixed_precision="fp16" \ From b98b314b7aa1b95829b316fb58aa9cabbb6fd2a6 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Mon, 18 Dec 2023 15:52:43 +0530 Subject: [PATCH 18/30] [Training] 
remove depcreated method from lora scripts. (#6207) remove depcreated method from lora scripts. --- examples/dreambooth/train_dreambooth_lora.py | 33 ------------------- .../dreambooth/train_dreambooth_lora_sdxl.py | 33 ------------------- .../text_to_image/train_text_to_image_lora.py | 33 ------------------- .../train_text_to_image_lora_sdxl.py | 33 ------------------- 4 files changed, 132 deletions(-) diff --git a/examples/dreambooth/train_dreambooth_lora.py b/examples/dreambooth/train_dreambooth_lora.py index 60213dd75685..55ef2bbeb8eb 100644 --- a/examples/dreambooth/train_dreambooth_lora.py +++ b/examples/dreambooth/train_dreambooth_lora.py @@ -64,39 +64,6 @@ logger = get_logger(__name__) -# TODO: This function should be removed once training scripts are rewritten in PEFT -def text_encoder_lora_state_dict(text_encoder): - state_dict = {} - - def text_encoder_attn_modules(text_encoder): - from transformers import CLIPTextModel, CLIPTextModelWithProjection - - attn_modules = [] - - if isinstance(text_encoder, (CLIPTextModel, CLIPTextModelWithProjection)): - for i, layer in enumerate(text_encoder.text_model.encoder.layers): - name = f"text_model.encoder.layers.{i}.self_attn" - mod = layer.self_attn - attn_modules.append((name, mod)) - - return attn_modules - - for name, module in text_encoder_attn_modules(text_encoder): - for k, v in module.q_proj.lora_linear_layer.state_dict().items(): - state_dict[f"{name}.q_proj.lora_linear_layer.{k}"] = v - - for k, v in module.k_proj.lora_linear_layer.state_dict().items(): - state_dict[f"{name}.k_proj.lora_linear_layer.{k}"] = v - - for k, v in module.v_proj.lora_linear_layer.state_dict().items(): - state_dict[f"{name}.v_proj.lora_linear_layer.{k}"] = v - - for k, v in module.out_proj.lora_linear_layer.state_dict().items(): - state_dict[f"{name}.out_proj.lora_linear_layer.{k}"] = v - - return state_dict - - def save_model_card( repo_id: str, images=None, diff --git a/examples/dreambooth/train_dreambooth_lora_sdxl.py b/examples/dreambooth/train_dreambooth_lora_sdxl.py index c8a9a6ad4812..c3a78eae34d7 100644 --- a/examples/dreambooth/train_dreambooth_lora_sdxl.py +++ b/examples/dreambooth/train_dreambooth_lora_sdxl.py @@ -64,39 +64,6 @@ logger = get_logger(__name__) -# TODO: This function should be removed once training scripts are rewritten in PEFT -def text_encoder_lora_state_dict(text_encoder): - state_dict = {} - - def text_encoder_attn_modules(text_encoder): - from transformers import CLIPTextModel, CLIPTextModelWithProjection - - attn_modules = [] - - if isinstance(text_encoder, (CLIPTextModel, CLIPTextModelWithProjection)): - for i, layer in enumerate(text_encoder.text_model.encoder.layers): - name = f"text_model.encoder.layers.{i}.self_attn" - mod = layer.self_attn - attn_modules.append((name, mod)) - - return attn_modules - - for name, module in text_encoder_attn_modules(text_encoder): - for k, v in module.q_proj.lora_linear_layer.state_dict().items(): - state_dict[f"{name}.q_proj.lora_linear_layer.{k}"] = v - - for k, v in module.k_proj.lora_linear_layer.state_dict().items(): - state_dict[f"{name}.k_proj.lora_linear_layer.{k}"] = v - - for k, v in module.v_proj.lora_linear_layer.state_dict().items(): - state_dict[f"{name}.v_proj.lora_linear_layer.{k}"] = v - - for k, v in module.out_proj.lora_linear_layer.state_dict().items(): - state_dict[f"{name}.out_proj.lora_linear_layer.{k}"] = v - - return state_dict - - def save_model_card( repo_id: str, images=None, diff --git a/examples/text_to_image/train_text_to_image_lora.py 
b/examples/text_to_image/train_text_to_image_lora.py index b63500f906a8..0af2c1b2a5b4 100644 --- a/examples/text_to_image/train_text_to_image_lora.py +++ b/examples/text_to_image/train_text_to_image_lora.py @@ -54,39 +54,6 @@ logger = get_logger(__name__, log_level="INFO") -# TODO: This function should be removed once training scripts are rewritten in PEFT -def text_encoder_lora_state_dict(text_encoder): - state_dict = {} - - def text_encoder_attn_modules(text_encoder): - from transformers import CLIPTextModel, CLIPTextModelWithProjection - - attn_modules = [] - - if isinstance(text_encoder, (CLIPTextModel, CLIPTextModelWithProjection)): - for i, layer in enumerate(text_encoder.text_model.encoder.layers): - name = f"text_model.encoder.layers.{i}.self_attn" - mod = layer.self_attn - attn_modules.append((name, mod)) - - return attn_modules - - for name, module in text_encoder_attn_modules(text_encoder): - for k, v in module.q_proj.lora_linear_layer.state_dict().items(): - state_dict[f"{name}.q_proj.lora_linear_layer.{k}"] = v - - for k, v in module.k_proj.lora_linear_layer.state_dict().items(): - state_dict[f"{name}.k_proj.lora_linear_layer.{k}"] = v - - for k, v in module.v_proj.lora_linear_layer.state_dict().items(): - state_dict[f"{name}.v_proj.lora_linear_layer.{k}"] = v - - for k, v in module.out_proj.lora_linear_layer.state_dict().items(): - state_dict[f"{name}.out_proj.lora_linear_layer.{k}"] = v - - return state_dict - - def save_model_card(repo_id: str, images=None, base_model=str, dataset_name=str, repo_folder=None): img_str = "" for i, image in enumerate(images): diff --git a/examples/text_to_image/train_text_to_image_lora_sdxl.py b/examples/text_to_image/train_text_to_image_lora_sdxl.py index 2e70c77e860e..a8a41b150523 100644 --- a/examples/text_to_image/train_text_to_image_lora_sdxl.py +++ b/examples/text_to_image/train_text_to_image_lora_sdxl.py @@ -63,39 +63,6 @@ logger = get_logger(__name__) -# TODO: This function should be removed once training scripts are rewritten in PEFT -def text_encoder_lora_state_dict(text_encoder): - state_dict = {} - - def text_encoder_attn_modules(text_encoder): - from transformers import CLIPTextModel, CLIPTextModelWithProjection - - attn_modules = [] - - if isinstance(text_encoder, (CLIPTextModel, CLIPTextModelWithProjection)): - for i, layer in enumerate(text_encoder.text_model.encoder.layers): - name = f"text_model.encoder.layers.{i}.self_attn" - mod = layer.self_attn - attn_modules.append((name, mod)) - - return attn_modules - - for name, module in text_encoder_attn_modules(text_encoder): - for k, v in module.q_proj.lora_linear_layer.state_dict().items(): - state_dict[f"{name}.q_proj.lora_linear_layer.{k}"] = v - - for k, v in module.k_proj.lora_linear_layer.state_dict().items(): - state_dict[f"{name}.k_proj.lora_linear_layer.{k}"] = v - - for k, v in module.v_proj.lora_linear_layer.state_dict().items(): - state_dict[f"{name}.v_proj.lora_linear_layer.{k}"] = v - - for k, v in module.out_proj.lora_linear_layer.state_dict().items(): - state_dict[f"{name}.out_proj.lora_linear_layer.{k}"] = v - - return state_dict - - def save_model_card( repo_id: str, images=None, From fcbed3fa796f83d31472db27dbfcc77f526ae3d7 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Mon, 18 Dec 2023 16:15:37 +0530 Subject: [PATCH 19/30] Fix SDXL Inpainting from single file with Refiner Model (#6147) * update * update * update --- src/diffusers/loaders/single_file.py | 4 + .../stable_diffusion/convert_from_ckpt.py | 130 ++++++++---------- 2 files changed, 63 insertions(+), 
71 deletions(-) diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py index a49280adfcfe..742984449e4f 100644 --- a/src/diffusers/loaders/single_file.py +++ b/src/diffusers/loaders/single_file.py @@ -169,10 +169,12 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs): load_safety_checker = kwargs.pop("load_safety_checker", True) prediction_type = kwargs.pop("prediction_type", None) text_encoder = kwargs.pop("text_encoder", None) + text_encoder_2 = kwargs.pop("text_encoder_2", None) vae = kwargs.pop("vae", None) controlnet = kwargs.pop("controlnet", None) adapter = kwargs.pop("adapter", None) tokenizer = kwargs.pop("tokenizer", None) + tokenizer_2 = kwargs.pop("tokenizer_2", None) torch_dtype = kwargs.pop("torch_dtype", None) @@ -274,8 +276,10 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs): load_safety_checker=load_safety_checker, prediction_type=prediction_type, text_encoder=text_encoder, + text_encoder_2=text_encoder_2, vae=vae, tokenizer=tokenizer, + tokenizer_2=tokenizer_2, original_config_file=original_config_file, config_files=config_files, local_files_only=local_files_only, diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py index 761391189f8f..5aa23252b86a 100644 --- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py @@ -1153,7 +1153,9 @@ def download_from_original_stable_diffusion_ckpt( vae_path=None, vae=None, text_encoder=None, + text_encoder_2=None, tokenizer=None, + tokenizer_2=None, config_files=None, ) -> DiffusionPipeline: """ @@ -1232,7 +1234,9 @@ def download_from_original_stable_diffusion_ckpt( StableDiffusionInpaintPipeline, StableDiffusionPipeline, StableDiffusionUpscalePipeline, + StableDiffusionXLControlNetInpaintPipeline, StableDiffusionXLImg2ImgPipeline, + StableDiffusionXLInpaintPipeline, StableDiffusionXLPipeline, StableUnCLIPImg2ImgPipeline, StableUnCLIPPipeline, @@ -1339,7 +1343,11 @@ def download_from_original_stable_diffusion_ckpt( else: pipeline_class = StableDiffusionXLPipeline if model_type == "SDXL" else StableDiffusionXLImg2ImgPipeline - if num_in_channels is None and pipeline_class == StableDiffusionInpaintPipeline: + if num_in_channels is None and pipeline_class in [ + StableDiffusionInpaintPipeline, + StableDiffusionXLInpaintPipeline, + StableDiffusionXLControlNetInpaintPipeline, + ]: num_in_channels = 9 if num_in_channels is None and pipeline_class == StableDiffusionUpscalePipeline: num_in_channels = 7 @@ -1686,7 +1694,9 @@ def download_from_original_stable_diffusion_ckpt( feature_extractor=feature_extractor, ) elif model_type in ["SDXL", "SDXL-Refiner"]: - if model_type == "SDXL": + is_refiner = model_type == "SDXL-Refiner" + + if (is_refiner is False) and (tokenizer is None): try: tokenizer = CLIPTokenizer.from_pretrained( "openai/clip-vit-large-patch14", local_files_only=local_files_only @@ -1695,7 +1705,11 @@ def download_from_original_stable_diffusion_ckpt( raise ValueError( f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'openai/clip-vit-large-patch14'." 
) + + if (is_refiner is False) and (text_encoder is None): text_encoder = convert_ldm_clip_checkpoint(checkpoint, local_files_only=local_files_only) + + if tokenizer_2 is None: try: tokenizer_2 = CLIPTokenizer.from_pretrained( "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!", local_files_only=local_files_only @@ -1705,95 +1719,69 @@ def download_from_original_stable_diffusion_ckpt( f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'laion/CLIP-ViT-bigG-14-laion2B-39B-b160k' with `pad_token` set to '!'." ) + if text_encoder_2 is None: config_name = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k" config_kwargs = {"projection_dim": 1280} - text_encoder_2 = convert_open_clip_checkpoint( - checkpoint, - config_name, - prefix="conditioner.embedders.1.model.", - has_projection=True, - local_files_only=local_files_only, - **config_kwargs, - ) - - if is_accelerate_available(): # SBM Now move model to cpu. - if model_type in ["SDXL", "SDXL-Refiner"]: - for param_name, param in converted_unet_checkpoint.items(): - set_module_tensor_to_device(unet, param_name, "cpu", value=param) + prefix = "conditioner.embedders.0.model." if is_refiner else "conditioner.embedders.1.model." - if controlnet: - pipe = pipeline_class( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - text_encoder_2=text_encoder_2, - tokenizer_2=tokenizer_2, - unet=unet, - controlnet=controlnet, - scheduler=scheduler, - force_zeros_for_empty_prompt=True, - ) - elif adapter: - pipe = pipeline_class( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - text_encoder_2=text_encoder_2, - tokenizer_2=tokenizer_2, - unet=unet, - adapter=adapter, - scheduler=scheduler, - force_zeros_for_empty_prompt=True, - ) - else: - pipe = pipeline_class( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - text_encoder_2=text_encoder_2, - tokenizer_2=tokenizer_2, - unet=unet, - scheduler=scheduler, - force_zeros_for_empty_prompt=True, - ) - else: - tokenizer = None - text_encoder = None - try: - tokenizer_2 = CLIPTokenizer.from_pretrained( - "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!", local_files_only=local_files_only - ) - except Exception: - raise ValueError( - f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'laion/CLIP-ViT-bigG-14-laion2B-39B-b160k' with `pad_token` set to '!'." - ) - config_name = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k" - config_kwargs = {"projection_dim": 1280} text_encoder_2 = convert_open_clip_checkpoint( checkpoint, config_name, - prefix="conditioner.embedders.0.model.", + prefix=prefix, has_projection=True, local_files_only=local_files_only, **config_kwargs, ) - if is_accelerate_available(): # SBM Now move model to cpu. - if model_type in ["SDXL", "SDXL-Refiner"]: - for param_name, param in converted_unet_checkpoint.items(): - set_module_tensor_to_device(unet, param_name, "cpu", value=param) + if is_accelerate_available(): # SBM Now move model to cpu. 
+ for param_name, param in converted_unet_checkpoint.items(): + set_module_tensor_to_device(unet, param_name, "cpu", value=param) - pipe = StableDiffusionXLImg2ImgPipeline( + if controlnet: + pipe = pipeline_class( vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, text_encoder_2=text_encoder_2, tokenizer_2=tokenizer_2, unet=unet, + controlnet=controlnet, + scheduler=scheduler, + force_zeros_for_empty_prompt=True, + ) + elif adapter: + pipe = pipeline_class( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + text_encoder_2=text_encoder_2, + tokenizer_2=tokenizer_2, + unet=unet, + adapter=adapter, scheduler=scheduler, - requires_aesthetics_score=True, - force_zeros_for_empty_prompt=False, + force_zeros_for_empty_prompt=True, ) + + else: + pipeline_kwargs = { + "vae": vae, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "text_encoder_2": text_encoder_2, + "tokenizer_2": tokenizer_2, + "unet": unet, + "scheduler": scheduler, + } + + if (pipeline_class == StableDiffusionXLImg2ImgPipeline) or ( + pipeline_class == StableDiffusionXLInpaintPipeline + ): + pipeline_kwargs.update({"requires_aesthetics_score": is_refiner}) + + if is_refiner: + pipeline_kwargs.update({"force_zeros_for_empty_prompt": False}) + + pipe = pipeline_class(**pipeline_kwargs) else: text_config = create_ldm_bert_config(original_config) text_model = convert_ldm_bert_checkpoint(checkpoint, text_config) From 6976cab7caa959bff46896c52ec03d9d414fc689 Mon Sep 17 00:00:00 2001 From: d8ahazard Date: Mon, 18 Dec 2023 04:51:20 -0600 Subject: [PATCH 20/30] Fix possible re-conversion issues after extracting from safetensors (#6097) * Fix possible re-conversion issues after extracting from diffusers Properly rename specific vae keys. * Whoops --- ...rt_diffusers_to_original_stable_diffusion.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/scripts/convert_diffusers_to_original_stable_diffusion.py b/scripts/convert_diffusers_to_original_stable_diffusion.py index 9da45211551e..cc90a5131732 100644 --- a/scripts/convert_diffusers_to_original_stable_diffusion.py +++ b/scripts/convert_diffusers_to_original_stable_diffusion.py @@ -159,6 +159,14 @@ def convert_unet_state_dict(unet_state_dict): ("proj_out.", "proj_attn."), ] +# This is probably not the most ideal solution, but it does work. +vae_extra_conversion_map = [ + ("to_q", "q"), + ("to_k", "k"), + ("to_v", "v"), + ("to_out.0", "proj_out"), +] + def reshape_weight_for_sd(w): # convert HF linear weights to SD conv2d weights @@ -178,11 +186,20 @@ def convert_vae_state_dict(vae_state_dict): mapping[k] = v new_state_dict = {v: vae_state_dict[k] for k, v in mapping.items()} weights_to_convert = ["q", "k", "v", "proj_out"] + keys_to_rename = {} for k, v in new_state_dict.items(): for weight_name in weights_to_convert: if f"mid.attn_1.{weight_name}.weight" in k: print(f"Reshaping {k} for SD format") new_state_dict[k] = reshape_weight_for_sd(v) + for weight_name, real_weight_name in vae_extra_conversion_map: + if f"mid.attn_1.{weight_name}.weight" in k or f"mid.attn_1.{weight_name}.bias" in k: + keys_to_rename[k] = k.replace(weight_name, real_weight_name) + for k, v in keys_to_rename.items(): + if k in new_state_dict: + print(f"Renaming {k} to {v}") + new_state_dict[v] = reshape_weight_for_sd(new_state_dict[k]) + del new_state_dict[k] return new_state_dict From d816bcb5e836a2421f33e3f9ad10782993b9aecd Mon Sep 17 00:00:00 2001 From: Abin Thomas Date: Mon, 18 Dec 2023 22:42:28 +0530 Subject: [PATCH 21/30] Fix t2i. 
blog url (#6205) --- docs/source/en/training/t2i_adapters.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/training/t2i_adapters.md b/docs/source/en/training/t2i_adapters.md index 9d4f292b1d3f..0f65ad8ed31d 100644 --- a/docs/source/en/training/t2i_adapters.md +++ b/docs/source/en/training/t2i_adapters.md @@ -224,4 +224,4 @@ image.save("./output.png") Congratulations on training a T2I-Adapter model! 🎉 To learn more: -- Read the [Efficient Controllable Generation for SDXL with T2I-Adapters](https://www.cs.cmu.edu/~custom-diffusion/) blog post to learn more details about the experimental results from the T2I-Adapter team. +- Read the [Efficient Controllable Generation for SDXL with T2I-Adapters](https://huggingface.co/blog/t2i-sdxl-adapters) blog post to learn more details about the experimental results from the T2I-Adapter team. From cce1fe2d41196eeeeda583660bea3101751c70e5 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 18 Dec 2023 18:21:09 +0100 Subject: [PATCH 22/30] [Text-to-Video] Clean up pipeline (#6213) * make style * make style * make style * make style --- .../pipeline_text_to_video_zero.py | 340 ++++++++++++- .../pipeline_text_to_video_zero_sdxl.py | 477 +++++++++++++++++- .../test_text_to_video_zero_sdxl.py | 2 +- 3 files changed, 800 insertions(+), 19 deletions(-) diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py index 0f9ffbebdcf6..64bdb476fe2d 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py @@ -1,4 +1,5 @@ import copy +import inspect from dataclasses import dataclass from typing import Callable, List, Optional, Union @@ -9,11 +10,18 @@ from torch.nn.functional import grid_sample from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer -from diffusers.models import AutoencoderKL, UNet2DConditionModel -from diffusers.pipelines.stable_diffusion import StableDiffusionPipeline, StableDiffusionSafetyChecker -from diffusers.schedulers import KarrasDiffusionSchedulers -from diffusers.utils import BaseOutput -from diffusers.utils.torch_utils import randn_tensor +from ...image_processor import VaeImageProcessor +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, UNet2DConditionModel +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import USE_PEFT_BACKEND, BaseOutput, logging, scale_lora_layers, unscale_lora_layers +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline +from ..stable_diffusion import StableDiffusionSafetyChecker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name def rearrange_0(tensor, f): @@ -273,7 +281,7 @@ def create_motion_field_and_warp_latents(motion_field_strength_x, motion_field_s return warped_latents -class TextToVideoZeroPipeline(StableDiffusionPipeline): +class TextToVideoZeroPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): r""" Pipeline for zero-shot text-to-video generation using Stable Diffusion. 
@@ -311,8 +319,15 @@ def __init__( feature_extractor: CLIPImageProcessor, requires_safety_checker: bool = True, ): - super().__init__( - vae, text_encoder, tokenizer, unet, scheduler, safety_checker, feature_extractor, requires_safety_checker + super().__init__() + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, ) processor = ( CrossFrameAttnProcessor2_0(batch_size=2) @@ -321,6 +336,18 @@ def __init__( ) self.unet.set_attn_processor(processor) + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + def forward_loop(self, x_t0, t0, t1, generator): """ Perform DDPM forward process from time t0 to t1. This is the same as adding noise with corresponding variance. @@ -420,6 +447,77 @@ def backward_loop( callback(step_idx, t, latents) return latents.clone().detach() + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs + def check_inputs( + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + callback_on_step_end_tensor_inputs=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. 
Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + @torch.no_grad() def __call__( self, @@ -539,9 +637,10 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # Encode input prompt - prompt_embeds = self._encode_prompt( + prompt_embeds_tuple = self.encode_prompt( prompt, device, num_videos_per_prompt, do_classifier_free_guidance, negative_prompt ) + prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]]) # Prepare timesteps self.scheduler.set_timesteps(num_inference_steps, device=device) @@ -645,3 +744,226 @@ def __call__( return (image, has_nsfw_concept) return TextToVideoPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. 
+ """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: procecss multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: procecss multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + def decode_latents(self, latents): + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py index fd020841494c..c31fa4f90cea 100644 --- 
a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py @@ -1,4 +1,5 @@ import copy +import inspect from dataclasses import dataclass from typing import Any, Callable, Dict, List, Optional, Tuple, Union @@ -15,11 +16,35 @@ CLIPVisionModelWithProjection, ) -from diffusers.models import AutoencoderKL, UNet2DConditionModel -from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipeline -from diffusers.schedulers import KarrasDiffusionSchedulers -from diffusers.utils import BaseOutput -from diffusers.utils.torch_utils import randn_tensor +from ...image_processor import VaeImageProcessor +from ...loaders import StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, UNet2DConditionModel +from ...models.attention_processor import ( + AttnProcessor2_0, + FusedAttnProcessor2_0, + LoRAAttnProcessor2_0, + LoRAXFormersAttnProcessor, + XFormersAttnProcessor, +) +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import ( + USE_PEFT_BACKEND, + BaseOutput, + is_invisible_watermark_available, + logging, + scale_lora_layers, + unscale_lora_layers, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline + + +if is_invisible_watermark_available(): + from ..stable_diffusion_xl.watermark import StableDiffusionXLWatermarker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.rearrange_0 @@ -300,7 +325,11 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): return noise_cfg -class TextToVideoZeroSDXLPipeline(StableDiffusionXLPipeline): +class TextToVideoZeroSDXLPipeline( + DiffusionPipeline, + StableDiffusionXLLoraLoaderMixin, + TextualInversionLoaderMixin, +): r""" Pipeline for zero-shot text-to-video generation using Stable Diffusion XL. @@ -332,6 +361,16 @@ class TextToVideoZeroSDXLPipeline(StableDiffusionXLPipeline): [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. 
""" + model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae" + _optional_components = [ + "tokenizer", + "tokenizer_2", + "text_encoder", + "text_encoder_2", + "image_encoder", + "feature_extractor", + ] + def __init__( self, vae: AutoencoderKL, @@ -346,7 +385,8 @@ def __init__( force_zeros_for_empty_prompt: bool = True, add_watermarker: Optional[bool] = None, ): - super().__init__( + super().__init__() + self.register_modules( vae=vae, text_encoder=text_encoder, text_encoder_2=text_encoder_2, @@ -356,16 +396,435 @@ def __init__( scheduler=scheduler, image_encoder=image_encoder, feature_extractor=feature_extractor, - force_zeros_for_empty_prompt=force_zeros_for_empty_prompt, - add_watermarker=add_watermarker, ) + self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + + self.default_sample_size = self.unet.config.sample_size + + add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available() + + if add_watermarker: + self.watermark = StableDiffusionXLWatermarker() + else: + self.watermark = None + processor = ( CrossFrameAttnProcessor2_0(batch_size=2) if hasattr(F, "scaled_dot_product_attention") else CrossFrameAttnProcessor(batch_size=2) ) + self.unet.set_attn_processor(processor) + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing + def enable_vae_slicing(self): + r""" + Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to + compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. + """ + self.vae.enable_slicing() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing + def disable_vae_slicing(self): + r""" + Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to + computing decoding in one step. 
+ """ + self.vae.disable_slicing() + + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.upcast_vae + def upcast_vae(self): + dtype = self.vae.dtype + self.vae.to(dtype=torch.float32) + use_torch_2_0_or_xformers = isinstance( + self.vae.decoder.mid_block.attentions[0].processor, + ( + AttnProcessor2_0, + XFormersAttnProcessor, + LoRAXFormersAttnProcessor, + LoRAAttnProcessor2_0, + FusedAttnProcessor2_0, + ), + ) + # if xformers or torch_2_0 is used attention block does not need + # to be in float32 which can save lots of memory + if use_torch_2_0_or_xformers: + self.vae.post_quant_conv.to(dtype) + self.vae.decoder.conv_in.to(dtype) + self.vae.decoder.mid_block.to(dtype) + + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._get_add_time_ids + def _get_add_time_ids( + self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None + ): + add_time_ids = list(original_size + crops_coords_top_left + target_size) + + passed_add_embed_dim = ( + self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim + ) + expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features + + if expected_add_embed_dim != passed_add_embed_dim: + raise ValueError( + f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`." + ) + + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + return add_time_ids + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.check_inputs + def check_inputs( + self, + prompt, + prompt_2, + height, + width, + callback_steps, + negative_prompt=None, + negative_prompt_2=None, + prompt_embeds=None, + negative_prompt_embeds=None, + pooled_prompt_embeds=None, + negative_pooled_prompt_embeds=None, + callback_on_step_end_tensor_inputs=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." 
+ ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt_2 is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)): + raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + elif negative_prompt_2 is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + if prompt_embeds is not None and pooled_prompt_embeds is None: + raise ValueError( + "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`." + ) + + if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None: + raise ValueError( + "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`." 
+ ) + + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt + def encode_prompt( + self, + prompt: str, + prompt_2: Optional[str] = None, + device: Optional[torch.device] = None, + num_images_per_prompt: int = 1, + do_classifier_free_guidance: bool = True, + negative_prompt: Optional[str] = None, + negative_prompt_2: Optional[str] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is + used in both text-encoders + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + negative_prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and + `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, pooled text embeddings will be generated from `prompt` input argument. + negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` + input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. 
+ """ + device = device or self._execution_device + + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if self.text_encoder is not None: + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if self.text_encoder_2 is not None: + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale) + else: + scale_lora_layers(self.text_encoder_2, lora_scale) + + prompt = [prompt] if isinstance(prompt, str) else prompt + + if prompt is not None: + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # Define tokenizers and text encoders + tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2] + text_encoders = ( + [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2] + ) + + if prompt_embeds is None: + prompt_2 = prompt_2 or prompt + prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2 + + # textual inversion: procecss multi-vector tokens if necessary + prompt_embeds_list = [] + prompts = [prompt, prompt_2] + for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders): + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, tokenizer) + + text_inputs = tokenizer( + prompt, + padding="max_length", + max_length=tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + + text_input_ids = text_inputs.input_ids + untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {tokenizer.model_max_length} tokens: {removed_text}" + ) + + prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True) + + # We are only ALWAYS interested in the pooled output of the final text encoder + pooled_prompt_embeds = prompt_embeds[0] + if clip_skip is None: + prompt_embeds = prompt_embeds.hidden_states[-2] + else: + # "2" because SDXL always indexes from the penultimate layer. 
+ prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)] + + prompt_embeds_list.append(prompt_embeds) + + prompt_embeds = torch.concat(prompt_embeds_list, dim=-1) + + # get unconditional embeddings for classifier free guidance + zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt + if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt: + negative_prompt_embeds = torch.zeros_like(prompt_embeds) + negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds) + elif do_classifier_free_guidance and negative_prompt_embeds is None: + negative_prompt = negative_prompt or "" + negative_prompt_2 = negative_prompt_2 or negative_prompt + + # normalize str to list + negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt + negative_prompt_2 = ( + batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2 + ) + + uncond_tokens: List[str] + if prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = [negative_prompt, negative_prompt_2] + + negative_prompt_embeds_list = [] + for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders): + if isinstance(self, TextualInversionLoaderMixin): + negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = tokenizer( + negative_prompt, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + negative_prompt_embeds = text_encoder( + uncond_input.input_ids.to(device), + output_hidden_states=True, + ) + # We are only ALWAYS interested in the pooled output of the final text encoder + negative_pooled_prompt_embeds = negative_prompt_embeds[0] + negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2] + + negative_prompt_embeds_list.append(negative_prompt_embeds) + + negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1) + + if self.text_encoder_2 is not None: + prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device) + else: + prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + if self.text_encoder_2 is not None: + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device) + else: + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = 
negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view( + bs_embed * num_images_per_prompt, -1 + ) + if do_classifier_free_guidance: + negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view( + bs_embed * num_images_per_prompt, -1 + ) + + if self.text_encoder is not None: + if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + if self.text_encoder_2 is not None: + if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder_2, lora_scale) + + return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds + # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.TextToVideoZeroPipeline.forward_loop def forward_loop(self, x_t0, t0, t1, generator): """ diff --git a/tests/pipelines/text_to_video_synthesis/test_text_to_video_zero_sdxl.py b/tests/pipelines/text_to_video_synthesis/test_text_to_video_zero_sdxl.py index 54faa9de6d62..510a8b482e8c 100644 --- a/tests/pipelines/text_to_video_synthesis/test_text_to_video_zero_sdxl.py +++ b/tests/pipelines/text_to_video_synthesis/test_text_to_video_zero_sdxl.py @@ -383,7 +383,7 @@ def test_xformers_attention_forwardGenerator_pass(self): class TextToVideoZeroSDXLPipelineSlowTests(unittest.TestCase): def test_full_model(self): model_id = "stabilityai/stable-diffusion-xl-base-1.0" - pipe = self.pipeline_class.from_pretrained( + pipe = TextToVideoZeroSDXLPipeline.from_pretrained( model_id, torch_dtype=torch.float16, variant="fp16", use_safetensors=True ) pipe.enable_model_cpu_offload() From 8d891e6e1bc02fb42a891d95cfa8a315dadb3b5a Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 18 Dec 2023 18:21:17 +0100 Subject: [PATCH 23/30] [Torch Compile] Fix torch compile for svd vae (#6217) --- .../pipeline_stable_video_diffusion.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py index a82f5379e71a..988623ca653e 100644 --- a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +++ b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py @@ -25,7 +25,7 @@ from ...models import AutoencoderKLTemporalDecoder, UNetSpatioTemporalConditionModel from ...schedulers import EulerDiscreteScheduler from ...utils import BaseOutput, logging -from ...utils.torch_utils import randn_tensor +from ...utils.torch_utils import is_compiled_module, randn_tensor from ..pipeline_utils import DiffusionPipeline @@ -211,7 +211,8 @@ def decode_latents(self, latents, num_frames, decode_chunk_size=14): latents = 1 / self.vae.config.scaling_factor * latents - accepts_num_frames = "num_frames" in set(inspect.signature(self.vae.forward).parameters.keys()) + forward_vae_fn = self.vae._orig_mod.forward if is_compiled_module(self.vae) else self.vae.forward + accepts_num_frames = "num_frames" in set(inspect.signature(forward_vae_fn).parameters.keys()) # decode decode_chunk_size frames at a time to avoid OOM frames = [] From a0c54828a170a03947dc6f7f44aa6548d288f36b Mon Sep 17 00:00:00 
2001 From: Dhruv Nair Date: Mon, 18 Dec 2023 23:08:29 +0530 Subject: [PATCH 24/30] Deprecate Pipelines (#6169) * deprecate pipe * make style * update * add deprecation message * format * remove tests for deprecated pipelines * remove deprecation message * make style * fix copies * clean up * clean * clean * clean * clean up * clean up * clean up toctree * clean up --------- Co-authored-by: Patrick von Platen --- docs/source/en/_toctree.yml | 28 -- docs/source/en/api/pipelines/alt_diffusion.md | 47 --- .../en/api/pipelines/audio_diffusion.md | 35 -- .../en/api/pipelines/cycle_diffusion.md | 33 -- .../api/pipelines/latent_diffusion_uncond.md | 35 -- docs/source/en/api/pipelines/model_editing.md | 35 -- docs/source/en/api/pipelines/paradigms.md | 51 --- docs/source/en/api/pipelines/pix2pix_zero.md | 289 ---------------- docs/source/en/api/pipelines/pndm.md | 35 -- docs/source/en/api/pipelines/repaint.md | 37 --- docs/source/en/api/pipelines/score_sde_ve.md | 35 -- .../en/api/pipelines/spectrogram_diffusion.md | 37 --- .../en/api/pipelines/stochastic_karras_ve.md | 33 -- .../en/api/pipelines/versatile_diffusion.md | 54 --- docs/source/en/api/pipelines/vq_diffusion.md | 35 -- src/diffusers/pipelines/__init__.py | 116 +++---- src/diffusers/pipelines/deprecated/README.md | 3 + .../pipelines/deprecated/__init__.py | 153 +++++++++ .../alt_diffusion/__init__.py | 6 +- .../alt_diffusion/modeling_roberta_series.py | 0 .../alt_diffusion/pipeline_alt_diffusion.py | 23 +- .../pipeline_alt_diffusion_img2img.py | 23 +- .../alt_diffusion/pipeline_output.py | 2 +- .../audio_diffusion/__init__.py | 2 +- .../{ => deprecated}/audio_diffusion/mel.py | 4 +- .../pipeline_audio_diffusion.py | 8 +- .../latent_diffusion_uncond/__init__.py | 2 +- .../pipeline_latent_diffusion_uncond.py | 8 +- .../{ => deprecated}/pndm/__init__.py | 2 +- .../{ => deprecated}/pndm/pipeline_pndm.py | 8 +- .../{ => deprecated}/repaint/__init__.py | 2 +- .../repaint/pipeline_repaint.py | 10 +- .../{ => deprecated}/score_sde_ve/__init__.py | 2 +- .../score_sde_ve/pipeline_score_sde_ve.py | 8 +- .../spectrogram_diffusion/__init__.py | 12 +- .../continuous_encoder.py | 4 +- .../spectrogram_diffusion/midi_utils.py | 2 +- .../spectrogram_diffusion/notes_encoder.py | 4 +- .../pipeline_spectrogram_diffusion.py | 12 +- .../stable_diffusion_variants/__init__.py | 55 ++++ .../pipeline_cycle_diffusion.py | 22 +- ...ne_onnx_stable_diffusion_inpaint_legacy.py | 12 +- ...ipeline_stable_diffusion_inpaint_legacy.py | 22 +- ...pipeline_stable_diffusion_model_editing.py | 22 +- .../pipeline_stable_diffusion_paradigms.py | 20 +- .../pipeline_stable_diffusion_pix2pix_zero.py | 24 +- .../stochastic_karras_ve/__init__.py | 2 +- .../pipeline_stochastic_karras_ve.py | 8 +- .../versatile_diffusion/__init__.py | 6 +- .../versatile_diffusion/modeling_text_unet.py | 20 +- .../pipeline_versatile_diffusion.py | 8 +- ...ipeline_versatile_diffusion_dual_guided.py | 12 +- ...ine_versatile_diffusion_image_variation.py | 12 +- ...eline_versatile_diffusion_text_to_image.py | 12 +- .../{ => deprecated}/vq_diffusion/__init__.py | 6 +- .../vq_diffusion/pipeline_vq_diffusion.py | 10 +- .../pipelines/stable_diffusion/__init__.py | 16 - .../pipeline_stable_diffusion_diffedit.py | 1 - tests/pipelines/altdiffusion/__init__.py | 0 .../altdiffusion/test_alt_diffusion.py | 260 --------------- .../test_alt_diffusion_img2img.py | 309 ------------------ tests/pipelines/audio_diffusion/__init__.py | 0 .../audio_diffusion/test_audio_diffusion.py | 203 ------------ 
.../test_latent_diffusion_uncond.py | 116 ------- tests/pipelines/repaint/__init__.py | 0 tests/pipelines/repaint/test_repaint.py | 169 ---------- tests/pipelines/score_sde_ve/__init__.py | 0 .../score_sde_ve/test_score_sde_ve.py | 91 ------ .../spectrogram_diffusion/__init__.py | 0 .../test_spectrogram_diffusion.py | 246 -------------- .../pipelines/versatile_diffusion/__init__.py | 0 .../test_versatile_diffusion_dual_guided.py | 107 ------ ...est_versatile_diffusion_image_variation.py | 57 ---- .../test_versatile_diffusion_mega.py | 129 -------- .../test_versatile_diffusion_text_to_image.py | 87 ----- tests/pipelines/vq_diffusion/__init__.py | 0 .../vq_diffusion/test_vq_diffusion.py | 227 ------------- 77 files changed, 451 insertions(+), 3075 deletions(-) delete mode 100644 docs/source/en/api/pipelines/alt_diffusion.md delete mode 100644 docs/source/en/api/pipelines/audio_diffusion.md delete mode 100644 docs/source/en/api/pipelines/cycle_diffusion.md delete mode 100644 docs/source/en/api/pipelines/latent_diffusion_uncond.md delete mode 100644 docs/source/en/api/pipelines/model_editing.md delete mode 100644 docs/source/en/api/pipelines/paradigms.md delete mode 100644 docs/source/en/api/pipelines/pix2pix_zero.md delete mode 100644 docs/source/en/api/pipelines/pndm.md delete mode 100644 docs/source/en/api/pipelines/repaint.md delete mode 100644 docs/source/en/api/pipelines/score_sde_ve.md delete mode 100644 docs/source/en/api/pipelines/spectrogram_diffusion.md delete mode 100644 docs/source/en/api/pipelines/stochastic_karras_ve.md delete mode 100644 docs/source/en/api/pipelines/versatile_diffusion.md delete mode 100644 docs/source/en/api/pipelines/vq_diffusion.md create mode 100644 src/diffusers/pipelines/deprecated/README.md create mode 100644 src/diffusers/pipelines/deprecated/__init__.py rename src/diffusers/pipelines/{ => deprecated}/alt_diffusion/__init__.py (91%) rename src/diffusers/pipelines/{ => deprecated}/alt_diffusion/modeling_roberta_series.py (100%) rename src/diffusers/pipelines/{ => deprecated}/alt_diffusion/pipeline_alt_diffusion.py (98%) rename src/diffusers/pipelines/{ => deprecated}/alt_diffusion/pipeline_alt_diffusion_img2img.py (98%) rename src/diffusers/pipelines/{ => deprecated}/alt_diffusion/pipeline_output.py (97%) rename src/diffusers/pipelines/{ => deprecated}/audio_diffusion/__init__.py (88%) rename src/diffusers/pipelines/{ => deprecated}/audio_diffusion/mel.py (97%) rename src/diffusers/pipelines/{ => deprecated}/audio_diffusion/pipeline_audio_diffusion.py (98%) rename src/diffusers/pipelines/{ => deprecated}/latent_diffusion_uncond/__init__.py (87%) rename src/diffusers/pipelines/{ => deprecated}/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py (96%) rename src/diffusers/pipelines/{ => deprecated}/pndm/__init__.py (86%) rename src/diffusers/pipelines/{ => deprecated}/pndm/pipeline_pndm.py (95%) rename src/diffusers/pipelines/{ => deprecated}/repaint/__init__.py (86%) rename src/diffusers/pipelines/{ => deprecated}/repaint/pipeline_repaint.py (97%) rename src/diffusers/pipelines/{ => deprecated}/score_sde_ve/__init__.py (87%) rename src/diffusers/pipelines/{ => deprecated}/score_sde_ve/pipeline_score_sde_ve.py (95%) rename src/diffusers/pipelines/{ => deprecated}/spectrogram_diffusion/__init__.py (85%) rename src/diffusers/pipelines/{ => deprecated}/spectrogram_diffusion/continuous_encoder.py (96%) rename src/diffusers/pipelines/{ => deprecated}/spectrogram_diffusion/midi_utils.py (99%) rename src/diffusers/pipelines/{ => 
deprecated}/spectrogram_diffusion/notes_encoder.py (96%) rename src/diffusers/pipelines/{ => deprecated}/spectrogram_diffusion/pipeline_spectrogram_diffusion.py (97%) create mode 100644 src/diffusers/pipelines/deprecated/stable_diffusion_variants/__init__.py rename src/diffusers/pipelines/{stable_diffusion => deprecated/stable_diffusion_variants}/pipeline_cycle_diffusion.py (98%) rename src/diffusers/pipelines/{stable_diffusion => deprecated/stable_diffusion_variants}/pipeline_onnx_stable_diffusion_inpaint_legacy.py (98%) rename src/diffusers/pipelines/{stable_diffusion => deprecated/stable_diffusion_variants}/pipeline_stable_diffusion_inpaint_legacy.py (98%) rename src/diffusers/pipelines/{stable_diffusion => deprecated/stable_diffusion_variants}/pipeline_stable_diffusion_model_editing.py (98%) rename src/diffusers/pipelines/{stable_diffusion => deprecated/stable_diffusion_variants}/pipeline_stable_diffusion_paradigms.py (98%) rename src/diffusers/pipelines/{stable_diffusion => deprecated/stable_diffusion_variants}/pipeline_stable_diffusion_pix2pix_zero.py (98%) rename src/diffusers/pipelines/{ => deprecated}/stochastic_karras_ve/__init__.py (87%) rename src/diffusers/pipelines/{ => deprecated}/stochastic_karras_ve/pipeline_stochastic_karras_ve.py (96%) rename src/diffusers/pipelines/{ => deprecated}/versatile_diffusion/__init__.py (94%) rename src/diffusers/pipelines/{ => deprecated}/versatile_diffusion/modeling_text_unet.py (99%) rename src/diffusers/pipelines/{ => deprecated}/versatile_diffusion/pipeline_versatile_diffusion.py (99%) rename src/diffusers/pipelines/{ => deprecated}/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py (98%) rename src/diffusers/pipelines/{ => deprecated}/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py (98%) rename src/diffusers/pipelines/{ => deprecated}/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py (98%) rename src/diffusers/pipelines/{ => deprecated}/vq_diffusion/__init__.py (90%) rename src/diffusers/pipelines/{ => deprecated}/vq_diffusion/pipeline_vq_diffusion.py (98%) delete mode 100644 tests/pipelines/altdiffusion/__init__.py delete mode 100644 tests/pipelines/altdiffusion/test_alt_diffusion.py delete mode 100644 tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py delete mode 100644 tests/pipelines/audio_diffusion/__init__.py delete mode 100644 tests/pipelines/audio_diffusion/test_audio_diffusion.py delete mode 100644 tests/pipelines/latent_diffusion/test_latent_diffusion_uncond.py delete mode 100644 tests/pipelines/repaint/__init__.py delete mode 100644 tests/pipelines/repaint/test_repaint.py delete mode 100644 tests/pipelines/score_sde_ve/__init__.py delete mode 100644 tests/pipelines/score_sde_ve/test_score_sde_ve.py delete mode 100644 tests/pipelines/spectrogram_diffusion/__init__.py delete mode 100644 tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py delete mode 100644 tests/pipelines/versatile_diffusion/__init__.py delete mode 100644 tests/pipelines/versatile_diffusion/test_versatile_diffusion_dual_guided.py delete mode 100644 tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_variation.py delete mode 100644 tests/pipelines/versatile_diffusion/test_versatile_diffusion_mega.py delete mode 100644 tests/pipelines/versatile_diffusion/test_versatile_diffusion_text_to_image.py delete mode 100644 tests/pipelines/vq_diffusion/__init__.py delete mode 100644 tests/pipelines/vq_diffusion/test_vq_diffusion.py diff --git a/docs/source/en/_toctree.yml 
b/docs/source/en/_toctree.yml index eab726d0b616..62588bf4abb8 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -244,14 +244,10 @@ - sections: - local: api/pipelines/overview title: Overview - - local: api/pipelines/alt_diffusion - title: AltDiffusion - local: api/pipelines/animatediff title: AnimateDiff - local: api/pipelines/attend_and_excite title: Attend-and-Excite - - local: api/pipelines/audio_diffusion - title: Audio Diffusion - local: api/pipelines/audioldm title: AudioLDM - local: api/pipelines/audioldm2 @@ -270,8 +266,6 @@ title: ControlNet-XS - local: api/pipelines/controlnetxs_sdxl title: ControlNet-XS with Stable Diffusion XL - - local: api/pipelines/cycle_diffusion - title: Cycle Diffusion - local: api/pipelines/dance_diffusion title: Dance Diffusion - local: api/pipelines/ddim @@ -302,26 +296,14 @@ title: MusicLDM - local: api/pipelines/paint_by_example title: Paint by Example - - local: api/pipelines/paradigms - title: Parallel Sampling of Diffusion Models - - local: api/pipelines/pix2pix_zero - title: Pix2Pix Zero - local: api/pipelines/pixart title: PixArt-α - - local: api/pipelines/pndm - title: PNDM - - local: api/pipelines/repaint - title: RePaint - - local: api/pipelines/score_sde_ve - title: Score SDE VE - local: api/pipelines/self_attention_guidance title: Self-Attention Guidance - local: api/pipelines/semantic_stable_diffusion title: Semantic Guidance - local: api/pipelines/shap_e title: Shap-E - - local: api/pipelines/spectrogram_diffusion - title: Spectrogram Diffusion - sections: - local: api/pipelines/stable_diffusion/overview title: Overview @@ -356,26 +338,16 @@ title: Stable Diffusion - local: api/pipelines/stable_unclip title: Stable unCLIP - - local: api/pipelines/stochastic_karras_ve - title: Stochastic Karras VE - - local: api/pipelines/model_editing - title: Text-to-image model editing - local: api/pipelines/text_to_video title: Text-to-video - local: api/pipelines/text_to_video_zero title: Text2Video-Zero - local: api/pipelines/unclip title: unCLIP - - local: api/pipelines/latent_diffusion_uncond - title: Unconditional Latent Diffusion - local: api/pipelines/unidiffuser title: UniDiffuser - local: api/pipelines/value_guided_sampling title: Value-guided sampling - - local: api/pipelines/versatile_diffusion - title: Versatile Diffusion - - local: api/pipelines/vq_diffusion - title: VQ Diffusion - local: api/pipelines/wuerstchen title: Wuerstchen title: Pipelines diff --git a/docs/source/en/api/pipelines/alt_diffusion.md b/docs/source/en/api/pipelines/alt_diffusion.md deleted file mode 100644 index d0326affbb63..000000000000 --- a/docs/source/en/api/pipelines/alt_diffusion.md +++ /dev/null @@ -1,47 +0,0 @@ - - -# AltDiffusion - -AltDiffusion was proposed in [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://huggingface.co/papers/2211.06679) by Zhongzhi Chen, Guang Liu, Bo-Wen Zhang, Fulong Ye, Qinghong Yang, Ledell Wu. - -The abstract from the paper is: - -*In this work, we present a conceptually simple and effective method to train a strong bilingual/multilingual multimodal representation model. Starting from the pre-trained multimodal representation model CLIP released by OpenAI, we altered its text encoder with a pre-trained multilingual text encoder XLM-R, and aligned both languages and image representations by a two-stage training schema consisting of teacher learning and contrastive learning. We validate our method through evaluations of a wide range of tasks. 
We set new state-of-the-art performances on a bunch of tasks including ImageNet-CN, Flicker30k-CN, COCO-CN and XTD. Further, we obtain very close performances with CLIP on almost all tasks, suggesting that one can simply alter the text encoder in CLIP for extended capabilities such as multilingual understanding. Our models and code are available at [this https URL](https://github.com/FlagAI-Open/FlagAI).* - -## Tips - -`AltDiffusion` is conceptually the same as [Stable Diffusion](./stable_diffusion/overview). - - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines. - - - -## AltDiffusionPipeline - -[[autodoc]] AltDiffusionPipeline - - all - - __call__ - -## AltDiffusionImg2ImgPipeline - -[[autodoc]] AltDiffusionImg2ImgPipeline - - all - - __call__ - -## AltDiffusionPipelineOutput - -[[autodoc]] pipelines.alt_diffusion.AltDiffusionPipelineOutput - - all - - __call__ diff --git a/docs/source/en/api/pipelines/audio_diffusion.md b/docs/source/en/api/pipelines/audio_diffusion.md deleted file mode 100644 index 3d140fe202a6..000000000000 --- a/docs/source/en/api/pipelines/audio_diffusion.md +++ /dev/null @@ -1,35 +0,0 @@ - - -# Audio Diffusion - -[Audio Diffusion](https://github.com/teticio/audio-diffusion) is by Robert Dargavel Smith, and it leverages the recent advances in image generation from diffusion models by converting audio samples to and from Mel spectrogram images. - - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines. - - - -## AudioDiffusionPipeline -[[autodoc]] AudioDiffusionPipeline - - all - - __call__ - -## AudioPipelineOutput -[[autodoc]] pipelines.AudioPipelineOutput - -## ImagePipelineOutput -[[autodoc]] pipelines.ImagePipelineOutput - -## Mel -[[autodoc]] Mel diff --git a/docs/source/en/api/pipelines/cycle_diffusion.md b/docs/source/en/api/pipelines/cycle_diffusion.md deleted file mode 100644 index 13ada0594a6a..000000000000 --- a/docs/source/en/api/pipelines/cycle_diffusion.md +++ /dev/null @@ -1,33 +0,0 @@ - - -# Cycle Diffusion - -Cycle Diffusion is a text guided image-to-image generation model proposed in [Unifying Diffusion Models' Latent Space, with Applications to CycleDiffusion and Guidance](https://huggingface.co/papers/2210.05559) by Chen Henry Wu, Fernando De la Torre. - -The abstract from the paper is: - -*Diffusion models have achieved unprecedented performance in generative modeling. The commonly-adopted formulation of the latent code of diffusion models is a sequence of gradually denoised samples, as opposed to the simpler (e.g., Gaussian) latent space of GANs, VAEs, and normalizing flows. This paper provides an alternative, Gaussian formulation of the latent space of various diffusion models, as well as an invertible DPM-Encoder that maps images into the latent space. While our formulation is purely based on the definition of diffusion models, we demonstrate several intriguing consequences. 
(1) Empirically, we observe that a common latent space emerges from two diffusion models trained independently on related domains. In light of this finding, we propose CycleDiffusion, which uses DPM-Encoder for unpaired image-to-image translation. Furthermore, applying CycleDiffusion to text-to-image diffusion models, we show that large-scale text-to-image diffusion models can be used as zero-shot image-to-image editors. (2) One can guide pre-trained diffusion models and GANs by controlling the latent codes in a unified, plug-and-play formulation based on energy-based models. Using the CLIP model and a face recognition model as guidance, we demonstrate that diffusion models have better coverage of low-density sub-populations and individuals than GANs. The code is publicly available at [this https URL](https://github.com/ChenWu98/cycle-diffusion).* - - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines. - - - -## CycleDiffusionPipeline -[[autodoc]] CycleDiffusionPipeline - - all - - __call__ - -## StableDiffusionPipelineOutput -[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput diff --git a/docs/source/en/api/pipelines/latent_diffusion_uncond.md b/docs/source/en/api/pipelines/latent_diffusion_uncond.md deleted file mode 100644 index 54835c2115b9..000000000000 --- a/docs/source/en/api/pipelines/latent_diffusion_uncond.md +++ /dev/null @@ -1,35 +0,0 @@ - - -# Unconditional Latent Diffusion - -Unconditional Latent Diffusion was proposed in [High-Resolution Image Synthesis with Latent Diffusion Models](https://huggingface.co/papers/2112.10752) by Robin Rombach, Andreas Blattmann, Dominik Lorenz, Patrick Esser, Björn Ommer. - -The abstract from the paper is: - -*By decomposing the image formation process into a sequential application of denoising autoencoders, diffusion models (DMs) achieve state-of-the-art synthesis results on image data and beyond. Additionally, their formulation allows for a guiding mechanism to control the image generation process without retraining. However, since these models typically operate directly in pixel space, optimization of powerful DMs often consumes hundreds of GPU days and inference is expensive due to sequential evaluations. To enable DM training on limited computational resources while retaining their quality and flexibility, we apply them in the latent space of powerful pretrained autoencoders. In contrast to previous work, training diffusion models on such a representation allows for the first time to reach a near-optimal point between complexity reduction and detail preservation, greatly boosting visual fidelity. By introducing cross-attention layers into the model architecture, we turn diffusion models into powerful and flexible generators for general conditioning inputs such as text or bounding boxes and high-resolution synthesis becomes possible in a convolutional manner.
Our latent diffusion models (LDMs) achieve a new state of the art for image inpainting and highly competitive performance on various tasks, including unconditional image generation, semantic scene synthesis, and super-resolution, while significantly reducing computational requirements compared to pixel-based DMs.* - -The original codebase can be found at [CompVis/latent-diffusion](https://github.com/CompVis/latent-diffusion). - - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines. - - - -## LDMPipeline -[[autodoc]] LDMPipeline - - all - - __call__ - -## ImagePipelineOutput -[[autodoc]] pipelines.ImagePipelineOutput diff --git a/docs/source/en/api/pipelines/model_editing.md b/docs/source/en/api/pipelines/model_editing.md deleted file mode 100644 index 2d94a50e4355..000000000000 --- a/docs/source/en/api/pipelines/model_editing.md +++ /dev/null @@ -1,35 +0,0 @@ - - -# Text-to-image model editing - -[Editing Implicit Assumptions in Text-to-Image Diffusion Models](https://huggingface.co/papers/2303.08084) is by Hadas Orgad, Bahjat Kawar, and Yonatan Belinkov. This pipeline enables editing diffusion model weights, such that its assumptions of a given concept are changed. The resulting change is expected to take effect in all prompt generations related to the edited concept. - -The abstract from the paper is: - -*Text-to-image diffusion models often make implicit assumptions about the world when generating images. While some assumptions are useful (e.g., the sky is blue), they can also be outdated, incorrect, or reflective of social biases present in the training data. Thus, there is a need to control these assumptions without requiring explicit user input or costly re-training. In this work, we aim to edit a given implicit assumption in a pre-trained diffusion model. Our Text-to-Image Model Editing method, TIME for short, receives a pair of inputs: a "source" under-specified prompt for which the model makes an implicit assumption (e.g., "a pack of roses"), and a "destination" prompt that describes the same setting, but with a specified desired attribute (e.g., "a pack of blue roses"). TIME then updates the model's cross-attention layers, as these layers assign visual meaning to textual tokens. We edit the projection matrices in these layers such that the source prompt is projected close to the destination prompt. Our method is highly efficient, as it modifies a mere 2.2% of the model's parameters in under one second. To evaluate model editing approaches, we introduce TIMED (TIME Dataset), containing 147 source and destination prompt pairs from various domains. Our experiments (using Stable Diffusion) show that TIME is successful in model editing, generalizes well for related prompts unseen during editing, and imposes minimal effect on unrelated generations.* - -You can find additional information about model editing on the [project page](https://time-diffusion.github.io/), [original codebase](https://github.com/bahjat-kawar/time-diffusion), and try it out in a [demo](https://huggingface.co/spaces/bahjat-kawar/time-diffusion). 
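Below is a minimal sketch of the editing workflow: apply the edit once up front, then generate as usual. The checkpoint, prompts, and the `edit_model` helper shown here are illustrative assumptions rather than the only way to drive the pipeline.

```py
from diffusers import StableDiffusionModelEditingPipeline

# Wrap a Stable Diffusion checkpoint with the model-editing pipeline (checkpoint name is illustrative).
pipe = StableDiffusionModelEditingPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
pipe = pipe.to("cuda")

# Edit the implicit assumption: "a pack of roses" should now mean blue roses.
source_prompt = "A pack of roses"
destination_prompt = "A pack of blue roses"
pipe.edit_model(source_prompt, destination_prompt)

# Subsequent generations related to the edited concept reflect the change.
image = pipe("A field of roses").images[0]
image.save("edited_roses.png")
```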
- - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines. - - - -## StableDiffusionModelEditingPipeline -[[autodoc]] StableDiffusionModelEditingPipeline - - __call__ - - all - -## StableDiffusionPipelineOutput -[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput diff --git a/docs/source/en/api/pipelines/paradigms.md b/docs/source/en/api/pipelines/paradigms.md deleted file mode 100644 index ca2fedc796df..000000000000 --- a/docs/source/en/api/pipelines/paradigms.md +++ /dev/null @@ -1,51 +0,0 @@ - - -# Parallel Sampling of Diffusion Models - -[Parallel Sampling of Diffusion Models](https://huggingface.co/papers/2305.16317) is by Andy Shih, Suneel Belkhale, Stefano Ermon, Dorsa Sadigh, Nima Anari. - -The abstract from the paper is: - -*Diffusion models are powerful generative models but suffer from slow sampling, often taking 1000 sequential denoising steps for one sample. As a result, considerable efforts have been directed toward reducing the number of denoising steps, but these methods hurt sample quality. Instead of reducing the number of denoising steps (trading quality for speed), in this paper we explore an orthogonal approach: can we run the denoising steps in parallel (trading compute for speed)? In spite of the sequential nature of the denoising steps, we show that surprisingly it is possible to parallelize sampling via Picard iterations, by guessing the solution of future denoising steps and iteratively refining until convergence. With this insight, we present ParaDiGMS, a novel method to accelerate the sampling of pretrained diffusion models by denoising multiple steps in parallel. ParaDiGMS is the first diffusion sampling method that enables trading compute for speed and is even compatible with existing fast sampling techniques such as DDIM and DPMSolver. Using ParaDiGMS, we improve sampling speed by 2-4x across a range of robotics and image generation models, giving state-of-the-art sampling speeds of 0.2s on 100-step DiffusionPolicy and 14.6s on 1000-step StableDiffusion-v2 with no measurable degradation of task reward, FID score, or CLIP score.* - -The original codebase can be found at [AndyShih12/paradigms](https://github.com/AndyShih12/paradigms), and the pipeline was contributed by [AndyShih12](https://github.com/AndyShih12). ❤️ - -## Tips - -This pipeline improves sampling speed by running denoising steps in parallel, at the cost of increased total FLOPs. -Therefore, it is better to call this pipeline when running on multiple GPUs. Otherwise, without enough GPU bandwidth -sampling may be even slower than sequential sampling. - -The two parameters to play with are `parallel` (batch size) and `tolerance`. -- If it fits in memory, for a 1000-step DDPM you can aim for a batch size of around 100 (for example, 8 GPUs and `batch_per_device=12` to get `parallel=96`). A higher batch size may not fit in memory, and lower batch size gives less parallelism. -- For tolerance, using a higher tolerance may get better speedups but can risk sample quality degradation. If there is quality degradation with the default tolerance, then use a lower tolerance like `0.001`. 
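Below is a minimal sketch of how these two arguments are passed. The checkpoint, prompt, `DDPMParallelScheduler`, and the `DataParallel` wrapping of the UNet are illustrative assumptions for a multi-GPU setup, not a prescribed configuration.

```py
import torch
from diffusers import DDPMParallelScheduler, StableDiffusionParadigmsPipeline

ckpt = "runwayml/stable-diffusion-v1-5"  # illustrative checkpoint
scheduler = DDPMParallelScheduler.from_pretrained(ckpt, subfolder="scheduler", timestep_spacing="trailing")
pipe = StableDiffusionParadigmsPipeline.from_pretrained(
    ckpt, scheduler=scheduler, torch_dtype=torch.float16
).to("cuda")

# Spread the batched denoising steps over all visible GPUs.
ngpu, batch_per_device = torch.cuda.device_count(), 5
pipe.wrapped_unet = torch.nn.DataParallel(pipe.unet, device_ids=list(range(ngpu)))

prompt = "a photo of an astronaut riding a horse on mars"
image = pipe(
    prompt,
    parallel=ngpu * batch_per_device,  # number of denoising steps processed together
    tolerance=0.1,                     # convergence threshold for the Picard iterations
    num_inference_steps=1000,
).images[0]
```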
- -For a 1000-step DDPM on 8 A100 GPUs, you can expect around a 3x speedup from [`StableDiffusionParadigmsPipeline`] compared to the [`StableDiffusionPipeline`] -by setting `parallel=80` and `tolerance=0.1`. - -🤗 Diffusers offers [distributed inference support](../../training/distributed_inference) for generating multiple prompts -in parallel on multiple GPUs. But [`StableDiffusionParadigmsPipeline`] is designed for speeding up sampling of a single prompt by using multiple GPUs. - - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines. - - - -## StableDiffusionParadigmsPipeline -[[autodoc]] StableDiffusionParadigmsPipeline - - __call__ - - all - -## StableDiffusionPipelineOutput -[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput diff --git a/docs/source/en/api/pipelines/pix2pix_zero.md b/docs/source/en/api/pipelines/pix2pix_zero.md deleted file mode 100644 index 6d7b9fb31471..000000000000 --- a/docs/source/en/api/pipelines/pix2pix_zero.md +++ /dev/null @@ -1,289 +0,0 @@ - - -# Pix2Pix Zero - -[Zero-shot Image-to-Image Translation](https://huggingface.co/papers/2302.03027) is by Gaurav Parmar, Krishna Kumar Singh, Richard Zhang, Yijun Li, Jingwan Lu, and Jun-Yan Zhu. - -The abstract from the paper is: - -*Large-scale text-to-image generative models have shown their remarkable ability to synthesize diverse and high-quality images. However, it is still challenging to directly apply these models for editing real images for two reasons. First, it is hard for users to come up with a perfect text prompt that accurately describes every visual detail in the input image. Second, while existing models can introduce desirable changes in certain regions, they often dramatically alter the input content and introduce unexpected changes in unwanted regions. In this work, we propose pix2pix-zero, an image-to-image translation method that can preserve the content of the original image without manual prompting. We first automatically discover editing directions that reflect desired edits in the text embedding space. To preserve the general content structure after editing, we further propose cross-attention guidance, which aims to retain the cross-attention maps of the input image throughout the diffusion process. In addition, our method does not need additional training for these edits and can directly use the existing pre-trained text-to-image diffusion model. We conduct extensive experiments and show that our method outperforms existing and concurrent works for both real and synthetic image editing.* - -You can find additional information about Pix2Pix Zero on the [project page](https://pix2pixzero.github.io/), [original codebase](https://github.com/pix2pixzero/pix2pix-zero), and try it out in a [demo](https://huggingface.co/spaces/pix2pix-zero-library/pix2pix-zero-demo). - -## Tips - -* The pipeline can be conditioned on real input images. Check out the code examples below to know more. -* The pipeline exposes two arguments namely `source_embeds` and `target_embeds` -that let you control the direction of the semantic edits in the final image to be generated. Let's say, -you wanted to translate from "cat" to "dog". In this case, the edit direction will be "cat -> dog". 
To reflect -this in the pipeline, you simply have to set the embeddings related to the phrases including "cat" to -`source_embeds` and "dog" to `target_embeds`. Refer to the code example below for more details. -* When you're using this pipeline from a prompt, specify the _source_ concept in the prompt. Taking -the above example, a valid input prompt would be: "a high resolution painting of a **cat** in the style of van gogh". -* If you wanted to reverse the direction in the example above, i.e., "dog -> cat", then it's recommended to: - * Swap the `source_embeds` and `target_embeds`. - * Change the input prompt to include "dog". -* To learn more about how the source and target embeddings are generated, refer to the [original paper](https://arxiv.org/abs/2302.03027). Below, we also provide some directions on how to generate the embeddings. -* Note that the quality of the outputs generated with this pipeline is dependent on how good the `source_embeds` and `target_embeds` are. Please, refer to [this discussion](#generating-source-and-target-embeddings) for some suggestions on the topic. - -## Available Pipelines: - -| Pipeline | Tasks | Demo -|---|---|:---:| -| [StableDiffusionPix2PixZeroPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py) | *Text-Based Image Editing* | [🤗 Space](https://huggingface.co/spaces/pix2pix-zero-library/pix2pix-zero-demo) | - - - -## Usage example - -### Based on an image generated with the input prompt - -```python -import requests -import torch - -from diffusers import DDIMScheduler, StableDiffusionPix2PixZeroPipeline - - -def download(embedding_url, local_filepath): - r = requests.get(embedding_url) - with open(local_filepath, "wb") as f: - f.write(r.content) - - -model_ckpt = "CompVis/stable-diffusion-v1-4" -pipeline = StableDiffusionPix2PixZeroPipeline.from_pretrained( - model_ckpt, conditions_input_image=False, torch_dtype=torch.float16 -) -pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config) -pipeline.to("cuda") - -prompt = "a high resolution painting of a cat in the style of van gogh" -src_embs_url = "https://github.com/pix2pixzero/pix2pix-zero/raw/main/assets/embeddings_sd_1.4/cat.pt" -target_embs_url = "https://github.com/pix2pixzero/pix2pix-zero/raw/main/assets/embeddings_sd_1.4/dog.pt" - -for url in [src_embs_url, target_embs_url]: - download(url, url.split("/")[-1]) - -src_embeds = torch.load(src_embs_url.split("/")[-1]) -target_embeds = torch.load(target_embs_url.split("/")[-1]) - -image = pipeline( - prompt, - source_embeds=src_embeds, - target_embeds=target_embeds, - num_inference_steps=50, - cross_attention_guidance_amount=0.15, -).images[0] -image -``` - -### Based on an input image - -When the pipeline is conditioned on an input image, we first obtain an inverted -noise from it using a `DDIMInverseScheduler` with the help of a generated caption. Then the inverted noise is used to start the generation process. 
 - -First, let's load our pipeline: - -```py -import torch -from transformers import BlipForConditionalGeneration, BlipProcessor -from diffusers import DDIMScheduler, DDIMInverseScheduler, StableDiffusionPix2PixZeroPipeline - -captioner_id = "Salesforce/blip-image-captioning-base" -processor = BlipProcessor.from_pretrained(captioner_id) -model = BlipForConditionalGeneration.from_pretrained(captioner_id, torch_dtype=torch.float16, low_cpu_mem_usage=True) - -sd_model_ckpt = "CompVis/stable-diffusion-v1-4" -pipeline = StableDiffusionPix2PixZeroPipeline.from_pretrained( - sd_model_ckpt, - caption_generator=model, - caption_processor=processor, - torch_dtype=torch.float16, - safety_checker=None, -) -pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config) -pipeline.inverse_scheduler = DDIMInverseScheduler.from_config(pipeline.scheduler.config) -pipeline.enable_model_cpu_offload() -``` - -Then, we load an input image for conditioning and obtain a suitable caption for it: - -```py -from diffusers.utils import load_image - -img_url = "https://github.com/pix2pixzero/pix2pix-zero/raw/main/assets/test_images/cats/cat_6.png" -raw_image = load_image(img_url).resize((512, 512)) -caption = pipeline.generate_caption(raw_image) -caption -``` - -Then we employ the generated caption and the input image to get the inverted noise: - -```py -generator = torch.manual_seed(0) -inv_latents = pipeline.invert(caption, image=raw_image, generator=generator).latents -``` - -Now, generate the image with edit directions: - -```py -# See the "Generating source and target embeddings" section below to -# automate the generation of these captions with a pre-trained model like Flan-T5 as explained below. -source_prompts = ["a cat sitting on the street", "a cat playing in the field", "a face of a cat"] -target_prompts = ["a dog sitting on the street", "a dog playing in the field", "a face of a dog"] - -source_embeds = pipeline.get_embeds(source_prompts, batch_size=2) -target_embeds = pipeline.get_embeds(target_prompts, batch_size=2) - - -image = pipeline( - caption, - source_embeds=source_embeds, - target_embeds=target_embeds, - num_inference_steps=50, - cross_attention_guidance_amount=0.15, - generator=generator, - latents=inv_latents, - negative_prompt=caption, -).images[0] -image -``` - -## Generating source and target embeddings - -The authors originally used the [GPT-3 API](https://openai.com/api/) to generate the source and target captions for discovering -edit directions. However, we can also leverage open source and public models for the same purpose. -Below, we provide an end-to-end example with the [Flan-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5) model -for generating captions and [CLIP](https://huggingface.co/docs/transformers/model_doc/clip) for -computing embeddings on the generated captions. - -**1. Load the generation model**: - -```py -import torch -from transformers import AutoTokenizer, T5ForConditionalGeneration - -tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xl") -model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xl", device_map="auto", torch_dtype=torch.float16) -``` - -**2. Construct a starting prompt**: - -```py -source_concept = "cat" -target_concept = "dog" - -source_text = f"Provide a caption for images containing a {source_concept}. The captions should be in English and should be no longer than 150 characters." - -target_text = f"Provide a caption for images containing a {target_concept}. The captions should be in English and should be no longer than 150 characters." -``` - -Here, we're interested in the "cat -> dog" direction. - -**3. Generate captions**: - -We can use a simple utility function for this purpose. - -```py -def generate_captions(input_prompt): - input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids.to("cuda") - - outputs = model.generate( - input_ids, temperature=0.8, num_return_sequences=16, do_sample=True, max_new_tokens=128, top_k=10 - ) - return tokenizer.batch_decode(outputs, skip_special_tokens=True) -``` - -And then we just call it to generate our captions: - -```py -source_captions = generate_captions(source_text) -target_captions = generate_captions(target_text) -print(source_captions, target_captions, sep='\n') -``` - -We encourage you to play around with the different parameters supported by the -`generate()` method ([documentation](https://huggingface.co/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationMixin.generate)) for the generation quality you are looking for. - -**4. Load the embedding model**: - -Here, we need to use the same text encoder model used by the subsequent Stable Diffusion model. - -```py -from diffusers import StableDiffusionPix2PixZeroPipeline - -pipeline = StableDiffusionPix2PixZeroPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16 -) -pipeline = pipeline.to("cuda") -tokenizer = pipeline.tokenizer -text_encoder = pipeline.text_encoder -``` - -**5. Compute embeddings**: - -```py -import torch - -def embed_captions(sentences, tokenizer, text_encoder, device="cuda"): - with torch.no_grad(): - embeddings = [] - for sent in sentences: - text_inputs = tokenizer( - sent, - padding="max_length", - max_length=tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - text_input_ids = text_inputs.input_ids - prompt_embeds = text_encoder(text_input_ids.to(device), attention_mask=None)[0] - embeddings.append(prompt_embeds) - return torch.concatenate(embeddings, dim=0).mean(dim=0).unsqueeze(0) - -source_embeddings = embed_captions(source_captions, tokenizer, text_encoder) -target_embeddings = embed_captions(target_captions, tokenizer, text_encoder) -``` - -And you're done! [Here](https://colab.research.google.com/drive/1tz2C1EdfZYAPlzXXbTnf-5PRBiR8_R1F?usp=sharing) is a Colab Notebook that you can use to interact with the entire process. - -Now, you can use these embeddings directly while calling the pipeline: - -```py -from diffusers import DDIMScheduler - -pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config) - -image = pipeline( - prompt, - source_embeds=source_embeddings, - target_embeds=target_embeddings, - num_inference_steps=50, - cross_attention_guidance_amount=0.15, -).images[0] -image -``` - - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
- - - -## StableDiffusionPix2PixZeroPipeline -[[autodoc]] StableDiffusionPix2PixZeroPipeline - - __call__ - - all diff --git a/docs/source/en/api/pipelines/pndm.md b/docs/source/en/api/pipelines/pndm.md deleted file mode 100644 index 162e7934dc22..000000000000 --- a/docs/source/en/api/pipelines/pndm.md +++ /dev/null @@ -1,35 +0,0 @@ - - -# PNDM - -[Pseudo Numerical Methods for Diffusion Models on Manifolds](https://huggingface.co/papers/2202.09778) (PNDM) is by Luping Liu, Yi Ren, Zhijie Lin and Zhou Zhao. - -The abstract from the paper is: - -*Denoising Diffusion Probabilistic Models (DDPMs) can generate high-quality samples such as image and audio samples. However, DDPMs require hundreds to thousands of iterations to produce final samples. Several prior works have successfully accelerated DDPMs through adjusting the variance schedule (e.g., Improved Denoising Diffusion Probabilistic Models) or the denoising equation (e.g., Denoising Diffusion Implicit Models (DDIMs)). However, these acceleration methods cannot maintain the quality of samples and even introduce new noise at a high speedup rate, which limit their practicability. To accelerate the inference process while keeping the sample quality, we provide a fresh perspective that DDPMs should be treated as solving differential equations on manifolds. Under such a perspective, we propose pseudo numerical methods for diffusion models (PNDMs). Specifically, we figure out how to solve differential equations on manifolds and show that DDIMs are simple cases of pseudo numerical methods. We change several classical numerical methods to corresponding pseudo numerical methods and find that the pseudo linear multi-step method is the best in most situations. According to our experiments, by directly using pre-trained models on Cifar10, CelebA and LSUN, PNDMs can generate higher quality synthetic images with only 50 steps compared with 1000-step DDIMs (20x speedup), significantly outperform DDIMs with 250 steps (by around 0.4 in FID) and have good generalization on different variance schedules.* - -The original codebase can be found at [luping-liu/PNDM](https://github.com/luping-liu/PNDM). - - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines. - - - -## PNDMPipeline -[[autodoc]] PNDMPipeline - - all - - __call__ - -## ImagePipelineOutput -[[autodoc]] pipelines.ImagePipelineOutput diff --git a/docs/source/en/api/pipelines/repaint.md b/docs/source/en/api/pipelines/repaint.md deleted file mode 100644 index 1be69a3f9a46..000000000000 --- a/docs/source/en/api/pipelines/repaint.md +++ /dev/null @@ -1,37 +0,0 @@ - - -# RePaint - -[RePaint: Inpainting using Denoising Diffusion Probabilistic Models](https://huggingface.co/papers/2201.09865) is by Andreas Lugmayr, Martin Danelljan, Andres Romero, Fisher Yu, Radu Timofte, Luc Van Gool. - -The abstract from the paper is: - -*Free-form inpainting is the task of adding new content to an image in the regions specified by an arbitrary binary mask. Most existing approaches train for a certain distribution of masks, which limits their generalization capabilities to unseen mask types. 
Furthermore, training with pixel-wise and perceptual losses often leads to simple textural extensions towards the missing areas instead of semantically meaningful generation. In this work, we propose RePaint: A Denoising Diffusion Probabilistic Model (DDPM) based inpainting approach that is applicable to even extreme masks. We employ a pretrained unconditional DDPM as the generative prior. To condition the generation process, we only alter the reverse diffusion iterations by sampling the unmasked regions using the given image information. Since this technique does not modify or condition the original DDPM network itself, the model produces high-quality and diverse output images for any inpainting form. We validate our method for both faces and general-purpose image inpainting using standard and extreme masks. -RePaint outperforms state-of-the-art Autoregressive, and GAN approaches for at least five out of six mask distributions.* - -The original codebase can be found at [andreas128/RePaint](https://github.com/andreas128/RePaint). - - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines. - - - - -## RePaintPipeline -[[autodoc]] RePaintPipeline - - all - - __call__ - -## ImagePipelineOutput -[[autodoc]] pipelines.ImagePipelineOutput diff --git a/docs/source/en/api/pipelines/score_sde_ve.md b/docs/source/en/api/pipelines/score_sde_ve.md deleted file mode 100644 index cc9c8574f92d..000000000000 --- a/docs/source/en/api/pipelines/score_sde_ve.md +++ /dev/null @@ -1,35 +0,0 @@ - - -# Score SDE VE - -[Score-Based Generative Modeling through Stochastic Differential Equations](https://huggingface.co/papers/2011.13456) (Score SDE) is by Yang Song, Jascha Sohl-Dickstein, Diederik P. Kingma, Abhishek Kumar, Stefano Ermon and Ben Poole. This pipeline implements the variance expanding (VE) variant of the stochastic differential equation method. - -The abstract from the paper is: - -*Creating noise from data is easy; creating data from noise is generative modeling. We present a stochastic differential equation (SDE) that smoothly transforms a complex data distribution to a known prior distribution by slowly injecting noise, and a corresponding reverse-time SDE that transforms the prior distribution back into the data distribution by slowly removing the noise. Crucially, the reverse-time SDE depends only on the time-dependent gradient field (\aka, score) of the perturbed data distribution. By leveraging advances in score-based generative modeling, we can accurately estimate these scores with neural networks, and use numerical SDE solvers to generate samples. We show that this framework encapsulates previous approaches in score-based generative modeling and diffusion probabilistic modeling, allowing for new sampling procedures and new modeling capabilities. In particular, we introduce a predictor-corrector framework to correct errors in the evolution of the discretized reverse-time SDE. We also derive an equivalent neural ODE that samples from the same distribution as the SDE, but additionally enables exact likelihood computation, and improved sampling efficiency. 
In addition, we provide a new way to solve inverse problems with score-based models, as demonstrated with experiments on class-conditional generation, image inpainting, and colorization. Combined with multiple architectural improvements, we achieve record-breaking performance for unconditional image generation on CIFAR-10 with an Inception score of 9.89 and FID of 2.20, a competitive likelihood of 2.99 bits/dim, and demonstrate high fidelity generation of 1024 x 1024 images for the first time from a score-based generative model.* - -The original codebase can be found at [yang-song/score_sde_pytorch](https://github.com/yang-song/score_sde_pytorch). - - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines. - - - -## ScoreSdeVePipeline -[[autodoc]] ScoreSdeVePipeline - - all - - __call__ - -## ImagePipelineOutput -[[autodoc]] pipelines.ImagePipelineOutput diff --git a/docs/source/en/api/pipelines/spectrogram_diffusion.md b/docs/source/en/api/pipelines/spectrogram_diffusion.md deleted file mode 100644 index cc9ff3e45646..000000000000 --- a/docs/source/en/api/pipelines/spectrogram_diffusion.md +++ /dev/null @@ -1,37 +0,0 @@ - - -# Spectrogram Diffusion - -[Spectrogram Diffusion](https://huggingface.co/papers/2206.05408) is by Curtis Hawthorne, Ian Simon, Adam Roberts, Neil Zeghidour, Josh Gardner, Ethan Manilow, and Jesse Engel. - -*An ideal music synthesizer should be both interactive and expressive, generating high-fidelity audio in realtime for arbitrary combinations of instruments and notes. Recent neural synthesizers have exhibited a tradeoff between domain-specific models that offer detailed control of only specific instruments, or raw waveform models that can train on any music but with minimal control and slow generation. In this work, we focus on a middle ground of neural synthesizers that can generate audio from MIDI sequences with arbitrary combinations of instruments in realtime. This enables training on a wide range of transcription datasets with a single model, which in turn offers note-level control of composition and instrumentation across a wide range of instruments. We use a simple two-stage process: MIDI to spectrograms with an encoder-decoder Transformer, then spectrograms to audio with a generative adversarial network (GAN) spectrogram inverter. We compare training the decoder as an autoregressive model and as a Denoising Diffusion Probabilistic Model (DDPM) and find that the DDPM approach is superior both qualitatively and as measured by audio reconstruction and Fréchet distance metrics. Given the interactivity and generality of this approach, we find this to be a promising first step towards interactive and expressive neural synthesis for arbitrary combinations of instruments and notes.* - -The original codebase can be found at [magenta/music-spectrogram-diffusion](https://github.com/magenta/music-spectrogram-diffusion). - -![img](https://storage.googleapis.com/music-synthesis-with-spectrogram-diffusion/architecture.png) - -As depicted above the model takes as input a MIDI file and tokenizes it into a sequence of 5 second intervals. 
Each tokenized interval, together with positional encodings, is then passed through the Note Encoder, and its representation is concatenated with the previous window's generated spectrogram representation obtained via the Context Encoder (for the initial 5 second window, this is set to zero). The resulting context is used as conditioning to sample the denoised spectrogram for the current MIDI window; this spectrogram is appended to the final output and also serves as the context for the next MIDI window. The process repeats until all the MIDI inputs have been processed. Finally, a MelGAN decoder converts the potentially long spectrogram into audio, which is the final output of this pipeline. - - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines. - - - -## SpectrogramDiffusionPipeline -[[autodoc]] SpectrogramDiffusionPipeline - - all - - __call__ - -## AudioPipelineOutput -[[autodoc]] pipelines.AudioPipelineOutput diff --git a/docs/source/en/api/pipelines/stochastic_karras_ve.md b/docs/source/en/api/pipelines/stochastic_karras_ve.md deleted file mode 100644 index 0e3f1a5b8333..000000000000 --- a/docs/source/en/api/pipelines/stochastic_karras_ve.md +++ /dev/null @@ -1,33 +0,0 @@ - - -# Stochastic Karras VE - -[Elucidating the Design Space of Diffusion-Based Generative Models](https://huggingface.co/papers/2206.00364) is by Tero Karras, Miika Aittala, Timo Aila and Samuli Laine. This pipeline implements the stochastic sampling tailored to variance expanding (VE) models. - -The abstract from the paper is: - -*We argue that the theory and practice of diffusion-based generative models are currently unnecessarily convoluted and seek to remedy the situation by presenting a design space that clearly separates the concrete design choices. This lets us identify several changes to both the sampling and training processes, as well as preconditioning of the score networks. Together, our improvements yield new state-of-the-art FID of 1.79 for CIFAR-10 in a class-conditional setting and 1.97 in an unconditional setting, with much faster sampling (35 network evaluations per image) than prior designs. To further demonstrate their modular nature, we show that our design changes dramatically improve both the efficiency and quality obtainable with pre-trained score networks from previous work, including improving the FID of a previously trained ImageNet-64 model from 2.07 to near-SOTA 1.55, and after re-training with our proposed improvements to a new SOTA of 1.36.* - - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
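Below is a minimal sketch of how this pipeline is assembled from a UNet and the matching Karras VE scheduler. The checkpoint is a placeholder borrowed from an unconditional DDPM model purely to show the wiring; for meaningful samples, substitute a UNet that was trained for this sampler.

```py
import torch
from diffusers import KarrasVePipeline, KarrasVeScheduler, UNet2DModel

# Placeholder checkpoint: demonstrates the plumbing, not sample quality.
unet = UNet2DModel.from_pretrained("google/ddpm-cat-256")
scheduler = KarrasVeScheduler()

pipe = KarrasVePipeline(unet=unet, scheduler=scheduler).to("cuda")
image = pipe(num_inference_steps=50, generator=torch.manual_seed(0)).images[0]
image.save("karras_ve_sample.png")
```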
- - - -## KarrasVePipeline -[[autodoc]] KarrasVePipeline - - all - - __call__ - -## ImagePipelineOutput -[[autodoc]] pipelines.ImagePipelineOutput diff --git a/docs/source/en/api/pipelines/versatile_diffusion.md b/docs/source/en/api/pipelines/versatile_diffusion.md deleted file mode 100644 index 953f4822486a..000000000000 --- a/docs/source/en/api/pipelines/versatile_diffusion.md +++ /dev/null @@ -1,54 +0,0 @@ - - -# Versatile Diffusion - -Versatile Diffusion was proposed in [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://huggingface.co/papers/2211.08332) by Xingqian Xu, Zhangyang Wang, Eric Zhang, Kai Wang, Humphrey Shi. - -The abstract from the paper is: - -*Recent advances in diffusion models have set an impressive milestone in many generation tasks, and trending works such as DALL-E2, Imagen, and Stable Diffusion have attracted great interest. Despite the rapid landscape changes, recent new approaches focus on extensions and performance rather than capacity, thus requiring separate models for separate tasks. In this work, we expand the existing single-flow diffusion pipeline into a multi-task multimodal network, dubbed Versatile Diffusion (VD), that handles multiple flows of text-to-image, image-to-text, and variations in one unified model. The pipeline design of VD instantiates a unified multi-flow diffusion framework, consisting of sharable and swappable layer modules that enable the crossmodal generality beyond images and text. Through extensive experiments, we demonstrate that VD successfully achieves the following: a) VD outperforms the baseline approaches and handles all its base tasks with competitive quality; b) VD enables novel extensions such as disentanglement of style and semantics, dual- and multi-context blending, etc.; c) The success of our multi-flow multimodal framework over images and text may inspire further diffusion-based universal AI research.* - -## Tips - -You can load the more memory intensive "all-in-one" [`VersatileDiffusionPipeline`] that supports all the tasks or use the individual pipelines which are more memory efficient. - -| **Pipeline** | **Supported tasks** | -|------------------------------------------------------|-----------------------------------| -| [`VersatileDiffusionPipeline`] | all of the below | -| [`VersatileDiffusionTextToImagePipeline`] | text-to-image | -| [`VersatileDiffusionImageVariationPipeline`] | image variation | -| [`VersatileDiffusionDualGuidedPipeline`] | image-text dual guided generation | - - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines. 
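As a concrete illustration of the task-specific variants listed above, below is a minimal text-to-image sketch. The checkpoint, prompt, dtype, and the `remove_unused_weights` call are illustrative assumptions rather than required settings.

```py
import torch
from diffusers import VersatileDiffusionTextToImagePipeline

pipe = VersatileDiffusionTextToImagePipeline.from_pretrained(
    "shi-labs/versatile-diffusion", torch_dtype=torch.float16
)
pipe.remove_unused_weights()  # keep only the text-to-image flow to save memory
pipe = pipe.to("cuda")

generator = torch.Generator(device="cuda").manual_seed(0)
image = pipe("an astronaut riding a horse on mars", generator=generator).images[0]
image.save("astronaut.png")
```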
- - - -## VersatileDiffusionPipeline -[[autodoc]] VersatileDiffusionPipeline - -## VersatileDiffusionTextToImagePipeline -[[autodoc]] VersatileDiffusionTextToImagePipeline - - all - - __call__ - -## VersatileDiffusionImageVariationPipeline -[[autodoc]] VersatileDiffusionImageVariationPipeline - - all - - __call__ - -## VersatileDiffusionDualGuidedPipeline -[[autodoc]] VersatileDiffusionDualGuidedPipeline - - all - - __call__ diff --git a/docs/source/en/api/pipelines/vq_diffusion.md b/docs/source/en/api/pipelines/vq_diffusion.md deleted file mode 100644 index f2b0db716123..000000000000 --- a/docs/source/en/api/pipelines/vq_diffusion.md +++ /dev/null @@ -1,35 +0,0 @@ - - -# VQ Diffusion - -[Vector Quantized Diffusion Model for Text-to-Image Synthesis](https://huggingface.co/papers/2111.14822) is by Shuyang Gu, Dong Chen, Jianmin Bao, Fang Wen, Bo Zhang, Dongdong Chen, Lu Yuan, Baining Guo. - -The abstract from the paper is: - -*We present the vector quantized diffusion (VQ-Diffusion) model for text-to-image generation. This method is based on a vector quantized variational autoencoder (VQ-VAE) whose latent space is modeled by a conditional variant of the recently developed Denoising Diffusion Probabilistic Model (DDPM). We find that this latent-space method is well-suited for text-to-image generation tasks because it not only eliminates the unidirectional bias with existing methods but also allows us to incorporate a mask-and-replace diffusion strategy to avoid the accumulation of errors, which is a serious problem with existing methods. Our experiments show that the VQ-Diffusion produces significantly better text-to-image generation results when compared with conventional autoregressive (AR) models with similar numbers of parameters. Compared with previous GAN-based text-to-image methods, our VQ-Diffusion can handle more complex scenes and improve the synthesized image quality by a large margin. Finally, we show that the image generation computation in our method can be made highly efficient by reparameterization. With traditional AR methods, the text-to-image generation time increases linearly with the output image resolution and hence is quite time consuming even for normal size images. The VQ-Diffusion allows us to achieve a better trade-off between quality and speed. Our experiments indicate that the VQ-Diffusion model with the reparameterization is fifteen times faster than traditional AR methods while achieving a better image quality.* - -The original codebase can be found at [microsoft/VQ-Diffusion](https://github.com/microsoft/VQ-Diffusion). - - - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines. 
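For reference, text-to-image generation with this pipeline follows the usual diffusers pattern. A minimal sketch, where the checkpoint, prompt, and dtype are illustrative assumptions:

```py
import torch
from diffusers import VQDiffusionPipeline

pipe = VQDiffusionPipeline.from_pretrained("microsoft/vq-diffusion-ithq", torch_dtype=torch.float16)
pipe = pipe.to("cuda")

image = pipe("teddy bear playing in the pool").images[0]
image.save("teddy_bear.png")
```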
- - - -## VQDiffusionPipeline -[[autodoc]] VQDiffusionPipeline - - all - - __call__ - -## ImagePipelineOutput -[[autodoc]] pipelines.ImagePipelineOutput diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 5e7b2e285f73..e7d34b623711 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -20,6 +20,7 @@ _import_structure = { "controlnet": [], "controlnet_xs": [], + "deprecated": [], "latent_diffusion": [], "stable_diffusion": [], "stable_diffusion_xl": [], @@ -44,16 +45,20 @@ _import_structure["ddpm"] = ["DDPMPipeline"] _import_structure["dit"] = ["DiTPipeline"] _import_structure["latent_diffusion"].extend(["LDMSuperResolutionPipeline"]) - _import_structure["latent_diffusion_uncond"] = ["LDMPipeline"] _import_structure["pipeline_utils"] = [ "AudioPipelineOutput", "DiffusionPipeline", "ImagePipelineOutput", ] - _import_structure["pndm"] = ["PNDMPipeline"] - _import_structure["repaint"] = ["RePaintPipeline"] - _import_structure["score_sde_ve"] = ["ScoreSdeVePipeline"] - _import_structure["stochastic_karras_ve"] = ["KarrasVePipeline"] + _import_structure["deprecated"].extend( + [ + "PNDMPipeline", + "LDMPipeline", + "RePaintPipeline", + "ScoreSdeVePipeline", + "KarrasVePipeline", + ] + ) try: if not (is_torch_available() and is_librosa_available()): raise OptionalDependencyNotAvailable() @@ -62,7 +67,23 @@ _dummy_objects.update(get_objects_from_module(dummy_torch_and_librosa_objects)) else: - _import_structure["audio_diffusion"] = ["AudioDiffusionPipeline", "Mel"] + _import_structure["deprecated"].extend(["AudioDiffusionPipeline", "Mel"]) + +try: + if not (is_transformers_available() and is_torch_available() and is_note_seq_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ..utils import dummy_transformers_and_torch_and_note_seq_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_transformers_and_torch_and_note_seq_objects)) +else: + _import_structure["deprecated"].extend( + [ + "MidiProcessor", + "SpectrogramDiffusionPipeline", + ] + ) + try: if not (is_torch_available() and is_transformers_available()): raise OptionalDependencyNotAvailable() @@ -71,10 +92,22 @@ _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) else: - _import_structure["alt_diffusion"] = [ - "AltDiffusionImg2ImgPipeline", - "AltDiffusionPipeline", - ] + _import_structure["deprecated"].extend( + [ + "VQDiffusionPipeline", + "AltDiffusionPipeline", + "AltDiffusionImg2ImgPipeline", + "CycleDiffusionPipeline", + "StableDiffusionInpaintPipelineLegacy", + "StableDiffusionPix2PixZeroPipeline", + "StableDiffusionParadigmsPipeline", + "StableDiffusionModelEditingPipeline", + "VersatileDiffusionDualGuidedPipeline", + "VersatileDiffusionImageVariationPipeline", + "VersatileDiffusionPipeline", + "VersatileDiffusionTextToImagePipeline", + ] + ) _import_structure["animatediff"] = ["AnimateDiffPipeline"] _import_structure["audioldm"] = ["AudioLDMPipeline"] _import_structure["audioldm2"] = [ @@ -146,7 +179,6 @@ _import_structure["stable_diffusion"].extend( [ "CLIPImageProjection", - "CycleDiffusionPipeline", "StableDiffusionAttendAndExcitePipeline", "StableDiffusionDepth2ImgPipeline", "StableDiffusionDiffEditPipeline", @@ -156,15 +188,11 @@ "StableDiffusionImageVariationPipeline", "StableDiffusionImg2ImgPipeline", "StableDiffusionInpaintPipeline", - "StableDiffusionInpaintPipelineLegacy", "StableDiffusionInstructPix2PixPipeline", 
"StableDiffusionLatentUpscalePipeline", "StableDiffusionLDM3DPipeline", - "StableDiffusionModelEditingPipeline", "StableDiffusionPanoramaPipeline", - "StableDiffusionParadigmsPipeline", "StableDiffusionPipeline", - "StableDiffusionPix2PixZeroPipeline", "StableDiffusionSAGPipeline", "StableDiffusionUpscalePipeline", "StableUnCLIPImg2ImgPipeline", @@ -198,13 +226,6 @@ "UniDiffuserPipeline", "UniDiffuserTextDecoder", ] - _import_structure["versatile_diffusion"] = [ - "VersatileDiffusionDualGuidedPipeline", - "VersatileDiffusionImageVariationPipeline", - "VersatileDiffusionPipeline", - "VersatileDiffusionTextToImagePipeline", - ] - _import_structure["vq_diffusion"] = ["VQDiffusionPipeline"] _import_structure["wuerstchen"] = [ "WuerstchenCombinedPipeline", "WuerstchenDecoderPipeline", @@ -231,7 +252,6 @@ [ "OnnxStableDiffusionImg2ImgPipeline", "OnnxStableDiffusionInpaintPipeline", - "OnnxStableDiffusionInpaintPipelineLegacy", "OnnxStableDiffusionPipeline", "OnnxStableDiffusionUpscalePipeline", "StableDiffusionOnnxPipeline", @@ -279,18 +299,6 @@ "FlaxStableDiffusionXLPipeline", ] ) -try: - if not (is_transformers_available() and is_torch_available() and is_note_seq_available()): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from ..utils import dummy_transformers_and_torch_and_note_seq_objects # noqa F403 - - _dummy_objects.update(get_objects_from_module(dummy_transformers_and_torch_and_note_seq_objects)) -else: - _import_structure["spectrogram_diffusion"] = [ - "MidiProcessor", - "SpectrogramDiffusionPipeline", - ] if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: try: @@ -309,18 +317,14 @@ from .dance_diffusion import DanceDiffusionPipeline from .ddim import DDIMPipeline from .ddpm import DDPMPipeline + from .deprecated import KarrasVePipeline, LDMPipeline, PNDMPipeline, RePaintPipeline, ScoreSdeVePipeline from .dit import DiTPipeline from .latent_diffusion import LDMSuperResolutionPipeline - from .latent_diffusion_uncond import LDMPipeline from .pipeline_utils import ( AudioPipelineOutput, DiffusionPipeline, ImagePipelineOutput, ) - from .pndm import PNDMPipeline - from .repaint import RePaintPipeline - from .score_sde_ve import ScoreSdeVePipeline - from .stochastic_karras_ve import KarrasVePipeline try: if not (is_torch_available() and is_librosa_available()): @@ -328,7 +332,7 @@ except OptionalDependencyNotAvailable: from ..utils.dummy_torch_and_librosa_objects import * else: - from .audio_diffusion import AudioDiffusionPipeline, Mel + from .deprecated import AudioDiffusionPipeline, Mel try: if not (is_torch_available() and is_transformers_available()): @@ -336,7 +340,6 @@ except OptionalDependencyNotAvailable: from ..utils.dummy_torch_and_transformers_objects import * else: - from .alt_diffusion import AltDiffusionImg2ImgPipeline, AltDiffusionPipeline from .animatediff import AnimateDiffPipeline from .audioldm import AudioLDMPipeline from .audioldm2 import ( @@ -366,6 +369,20 @@ IFPipeline, IFSuperResolutionPipeline, ) + from .deprecated import ( + AltDiffusionImg2ImgPipeline, + AltDiffusionPipeline, + CycleDiffusionPipeline, + StableDiffusionInpaintPipelineLegacy, + StableDiffusionModelEditingPipeline, + StableDiffusionParadigmsPipeline, + StableDiffusionPix2PixZeroPipeline, + VersatileDiffusionDualGuidedPipeline, + VersatileDiffusionImageVariationPipeline, + VersatileDiffusionPipeline, + VersatileDiffusionTextToImagePipeline, + VQDiffusionPipeline, + ) from .kandinsky import ( KandinskyCombinedPipeline, KandinskyImg2ImgCombinedPipeline, @@ -403,7 +420,6 @@ 
from .shap_e import ShapEImg2ImgPipeline, ShapEPipeline from .stable_diffusion import ( CLIPImageProjection, - CycleDiffusionPipeline, StableDiffusionAttendAndExcitePipeline, StableDiffusionDepth2ImgPipeline, StableDiffusionDiffEditPipeline, @@ -412,15 +428,11 @@ StableDiffusionImageVariationPipeline, StableDiffusionImg2ImgPipeline, StableDiffusionInpaintPipeline, - StableDiffusionInpaintPipelineLegacy, StableDiffusionInstructPix2PixPipeline, StableDiffusionLatentUpscalePipeline, StableDiffusionLDM3DPipeline, - StableDiffusionModelEditingPipeline, StableDiffusionPanoramaPipeline, - StableDiffusionParadigmsPipeline, StableDiffusionPipeline, - StableDiffusionPix2PixZeroPipeline, StableDiffusionSAGPipeline, StableDiffusionUpscalePipeline, StableUnCLIPImg2ImgPipeline, @@ -451,13 +463,6 @@ UniDiffuserPipeline, UniDiffuserTextDecoder, ) - from .versatile_diffusion import ( - VersatileDiffusionDualGuidedPipeline, - VersatileDiffusionImageVariationPipeline, - VersatileDiffusionPipeline, - VersatileDiffusionTextToImagePipeline, - ) - from .vq_diffusion import VQDiffusionPipeline from .wuerstchen import ( WuerstchenCombinedPipeline, WuerstchenDecoderPipeline, @@ -482,7 +487,6 @@ from .stable_diffusion import ( OnnxStableDiffusionImg2ImgPipeline, OnnxStableDiffusionInpaintPipeline, - OnnxStableDiffusionInpaintPipelineLegacy, OnnxStableDiffusionPipeline, OnnxStableDiffusionUpscalePipeline, StableDiffusionOnnxPipeline, @@ -527,7 +531,7 @@ from ..utils.dummy_transformers_and_torch_and_note_seq_objects import * # noqa F403 else: - from .spectrogram_diffusion import ( + from .deprecated import ( MidiProcessor, SpectrogramDiffusionPipeline, ) diff --git a/src/diffusers/pipelines/deprecated/README.md b/src/diffusers/pipelines/deprecated/README.md new file mode 100644 index 000000000000..1e21dbbbd96c --- /dev/null +++ b/src/diffusers/pipelines/deprecated/README.md @@ -0,0 +1,3 @@ +# Deprecated Pipelines + +This folder contains pipelines that have very low usage as measured by model downloads, issues and PRs. While you can still use the pipelines just as before, we will stop testing the pipelines and will not accept any changes to existing files. 
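
The snippet below is a minimal sketch of how imports look after this move. It is not part of the patch itself: it assumes the top-level re-exports in `src/diffusers/__init__.py` (not shown in this diff) are left unchanged, and it only uses classes and module paths that the hunks above and below register under the new `deprecated` package.

```python
# Sketch only — illustrates the import paths implied by this patch, under the
# assumption that diffusers/__init__.py keeps re-exporting from .pipelines.

# Top-level imports keep working because diffusers/pipelines/__init__.py now
# re-exports these classes from the new `deprecated` package:
from diffusers import AltDiffusionPipeline, VQDiffusionPipeline  # noqa: F401

# Imports that previously targeted the old sub-packages (e.g.
# diffusers.pipelines.repaint or diffusers.pipelines.alt_diffusion) now go
# through diffusers.pipelines.deprecated, which lazily maps each class to its
# relocated submodule via _LazyModule and _import_structure:
from diffusers.pipelines.deprecated import RePaintPipeline, ScoreSdeVePipeline  # noqa: F401
from diffusers.pipelines.deprecated.alt_diffusion import AltDiffusionImg2ImgPipeline  # noqa: F401
```

In other words, user-facing `from diffusers import ...` statements are unaffected; only code that reached into the old sub-package paths renamed by this patch needs to switch to the `deprecated` namespace.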
\ No newline at end of file diff --git a/src/diffusers/pipelines/deprecated/__init__.py b/src/diffusers/pipelines/deprecated/__init__.py new file mode 100644 index 000000000000..9936323170ad --- /dev/null +++ b/src/diffusers/pipelines/deprecated/__init__.py @@ -0,0 +1,153 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_librosa_available, + is_note_seq_available, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_pt_objects + + _dummy_objects.update(get_objects_from_module(dummy_pt_objects)) +else: + _import_structure["latent_diffusion_uncond"] = ["LDMPipeline"] + _import_structure["pndm"] = ["PNDMPipeline"] + _import_structure["repaint"] = ["RePaintPipeline"] + _import_structure["score_sde_ve"] = ["ScoreSdeVePipeline"] + _import_structure["stochastic_karras_ve"] = ["KarrasVePipeline"] + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["alt_diffusion"] = [ + "AltDiffusionImg2ImgPipeline", + "AltDiffusionPipeline", + "AltDiffusionPipelineOutput", + ] + _import_structure["versatile_diffusion"] = [ + "VersatileDiffusionDualGuidedPipeline", + "VersatileDiffusionImageVariationPipeline", + "VersatileDiffusionPipeline", + "VersatileDiffusionTextToImagePipeline", + ] + _import_structure["vq_diffusion"] = ["VQDiffusionPipeline"] + _import_structure["stable_diffusion_variants"] = [ + "CycleDiffusionPipeline", + "StableDiffusionInpaintPipelineLegacy", + "StableDiffusionPix2PixZeroPipeline", + "StableDiffusionParadigmsPipeline", + "StableDiffusionModelEditingPipeline", + ] + +try: + if not (is_torch_available() and is_librosa_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_librosa_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_librosa_objects)) + +else: + _import_structure["audio_diffusion"] = ["AudioDiffusionPipeline", "Mel"] + +try: + if not (is_transformers_available() and is_torch_available() and is_note_seq_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_transformers_and_torch_and_note_seq_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_transformers_and_torch_and_note_seq_objects)) + +else: + _import_structure["spectrogram_diffusion"] = ["MidiProcessor", "SpectrogramDiffusionPipeline"] + + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_pt_objects import * + + else: + from .latent_diffusion_uncond import LDMPipeline + from .pndm import PNDMPipeline + from .repaint import RePaintPipeline + from .score_sde_ve import ScoreSdeVePipeline + from .stochastic_karras_ve import KarrasVePipeline + + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from 
...utils.dummy_torch_and_transformers_objects import * + + else: + from .alt_diffusion import AltDiffusionImg2ImgPipeline, AltDiffusionPipeline, AltDiffusionPipelineOutput + from .audio_diffusion import AudioDiffusionPipeline, Mel + from .spectrogram_diffusion import SpectrogramDiffusionPipeline + from .stable_diffusion_variants import ( + CycleDiffusionPipeline, + StableDiffusionInpaintPipelineLegacy, + StableDiffusionModelEditingPipeline, + StableDiffusionParadigmsPipeline, + StableDiffusionPix2PixZeroPipeline, + ) + from .stochastic_karras_ve import KarrasVePipeline + from .versatile_diffusion import ( + VersatileDiffusionDualGuidedPipeline, + VersatileDiffusionImageVariationPipeline, + VersatileDiffusionPipeline, + VersatileDiffusionTextToImagePipeline, + ) + from .vq_diffusion import VQDiffusionPipeline + + try: + if not (is_torch_available() and is_librosa_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_librosa_objects import * + else: + from .audio_diffusion import AudioDiffusionPipeline, Mel + + try: + if not (is_transformers_available() and is_torch_available() and is_note_seq_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_transformers_and_torch_and_note_seq_objects import * # noqa F403 + else: + from .spectrogram_diffusion import ( + MidiProcessor, + SpectrogramDiffusionPipeline, + ) + + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/src/diffusers/pipelines/alt_diffusion/__init__.py b/src/diffusers/pipelines/deprecated/alt_diffusion/__init__.py similarity index 91% rename from src/diffusers/pipelines/alt_diffusion/__init__.py rename to src/diffusers/pipelines/deprecated/alt_diffusion/__init__.py index 05c86f0a160e..71fa15b3feff 100644 --- a/src/diffusers/pipelines/alt_diffusion/__init__.py +++ b/src/diffusers/pipelines/deprecated/alt_diffusion/__init__.py @@ -1,6 +1,6 @@ from typing import TYPE_CHECKING -from ...utils import ( +from ....utils import ( DIFFUSERS_SLOW_IMPORT, OptionalDependencyNotAvailable, _LazyModule, @@ -17,7 +17,7 @@ if not (is_transformers_available() and is_torch_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ...utils import dummy_torch_and_transformers_objects + from ....utils import dummy_torch_and_transformers_objects _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) else: @@ -32,7 +32,7 @@ if not (is_transformers_available() and is_torch_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ...utils.dummy_torch_and_transformers_objects import * + from ....utils.dummy_torch_and_transformers_objects import * else: from .modeling_roberta_series import RobertaSeriesModelWithTransformation diff --git a/src/diffusers/pipelines/alt_diffusion/modeling_roberta_series.py b/src/diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py similarity index 100% rename from src/diffusers/pipelines/alt_diffusion/modeling_roberta_series.py rename to src/diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py 
similarity index 98% rename from src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py rename to src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py index dfeddab6dced..45e82a28d2e0 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +++ b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py @@ -19,14 +19,14 @@ from packaging import version from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection, XLMRobertaTokenizer -from ...configuration_utils import FrozenDict -from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin -from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel -from ...models.attention_processor import FusedAttnProcessor2_0 -from ...models.lora import adjust_lora_scale_text_encoder -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import ( +from ....configuration_utils import FrozenDict +from ....image_processor import PipelineImageInput, VaeImageProcessor +from ....loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ....models import AutoencoderKL, ImageProjection, UNet2DConditionModel +from ....models.attention_processor import FusedAttnProcessor2_0 +from ....models.lora import adjust_lora_scale_text_encoder +from ....schedulers import KarrasDiffusionSchedulers +from ....utils import ( USE_PEFT_BACKEND, deprecate, logging, @@ -34,9 +34,9 @@ scale_lora_layers, unscale_lora_layers, ) -from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline -from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker +from ....utils.torch_utils import randn_tensor +from ...pipeline_utils import DiffusionPipeline +from ...stable_diffusion.safety_checker import StableDiffusionSafetyChecker from .modeling_roberta_series import RobertaSeriesModelWithTransformation from .pipeline_output import AltDiffusionPipelineOutput @@ -119,7 +119,6 @@ def retrieve_timesteps( return timesteps, num_inference_steps -# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline with Stable->Alt, CLIPTextModel->RobertaSeriesModelWithTransformation, CLIPTokenizer->XLMRobertaTokenizer, AltDiffusionSafetyChecker->StableDiffusionSafetyChecker class AltDiffusionPipeline( DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin ): diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py similarity index 98% rename from src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py rename to src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py index d87a9eaa1e8d..9838bb9e5ba6 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -21,14 +21,14 @@ from packaging import version from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection, XLMRobertaTokenizer -from ...configuration_utils import FrozenDict -from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin -from ...models import 
AutoencoderKL, ImageProjection, UNet2DConditionModel -from ...models.attention_processor import FusedAttnProcessor2_0 -from ...models.lora import adjust_lora_scale_text_encoder -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import ( +from ....configuration_utils import FrozenDict +from ....image_processor import PipelineImageInput, VaeImageProcessor +from ....loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ....models import AutoencoderKL, ImageProjection, UNet2DConditionModel +from ....models.attention_processor import FusedAttnProcessor2_0 +from ....models.lora import adjust_lora_scale_text_encoder +from ....schedulers import KarrasDiffusionSchedulers +from ....utils import ( PIL_INTERPOLATION, USE_PEFT_BACKEND, deprecate, @@ -37,9 +37,9 @@ scale_lora_layers, unscale_lora_layers, ) -from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline -from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker +from ....utils.torch_utils import randn_tensor +from ...pipeline_utils import DiffusionPipeline +from ...stable_diffusion.safety_checker import StableDiffusionSafetyChecker from .modeling_roberta_series import RobertaSeriesModelWithTransformation from .pipeline_output import AltDiffusionPipelineOutput @@ -159,7 +159,6 @@ def retrieve_timesteps( return timesteps, num_inference_steps -# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline with Stable->Alt, CLIPTextModel->RobertaSeriesModelWithTransformation, CLIPTokenizer->XLMRobertaTokenizer, AltDiffusionSafetyChecker->StableDiffusionSafetyChecker class AltDiffusionImg2ImgPipeline( DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin, FromSingleFileMixin ): diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_output.py b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_output.py similarity index 97% rename from src/diffusers/pipelines/alt_diffusion/pipeline_output.py rename to src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_output.py index 997e187af6c1..dd174ae3c21f 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_output.py +++ b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_output.py @@ -4,7 +4,7 @@ import numpy as np import PIL.Image -from ...utils import ( +from ....utils import ( BaseOutput, ) diff --git a/src/diffusers/pipelines/audio_diffusion/__init__.py b/src/diffusers/pipelines/deprecated/audio_diffusion/__init__.py similarity index 88% rename from src/diffusers/pipelines/audio_diffusion/__init__.py rename to src/diffusers/pipelines/deprecated/audio_diffusion/__init__.py index d41c166a1ccb..3127951863a7 100644 --- a/src/diffusers/pipelines/audio_diffusion/__init__.py +++ b/src/diffusers/pipelines/deprecated/audio_diffusion/__init__.py @@ -1,6 +1,6 @@ from typing import TYPE_CHECKING -from ...utils import DIFFUSERS_SLOW_IMPORT, _LazyModule +from ....utils import DIFFUSERS_SLOW_IMPORT, _LazyModule _import_structure = { diff --git a/src/diffusers/pipelines/audio_diffusion/mel.py b/src/diffusers/pipelines/deprecated/audio_diffusion/mel.py similarity index 97% rename from src/diffusers/pipelines/audio_diffusion/mel.py rename to src/diffusers/pipelines/deprecated/audio_diffusion/mel.py index 38a11cdaab7d..0e33825787bb 100644 --- a/src/diffusers/pipelines/audio_diffusion/mel.py +++ b/src/diffusers/pipelines/deprecated/audio_diffusion/mel.py @@ -15,8 +15,8 @@ import numpy as np # 
noqa: E402 -from ...configuration_utils import ConfigMixin, register_to_config -from ...schedulers.scheduling_utils import SchedulerMixin +from ....configuration_utils import ConfigMixin, register_to_config +from ....schedulers.scheduling_utils import SchedulerMixin try: diff --git a/src/diffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py b/src/diffusers/pipelines/deprecated/audio_diffusion/pipeline_audio_diffusion.py similarity index 98% rename from src/diffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py rename to src/diffusers/pipelines/deprecated/audio_diffusion/pipeline_audio_diffusion.py index 6c4ae88b228d..731d22f3def8 100644 --- a/src/diffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py +++ b/src/diffusers/pipelines/deprecated/audio_diffusion/pipeline_audio_diffusion.py @@ -20,10 +20,10 @@ import torch from PIL import Image -from ...models import AutoencoderKL, UNet2DConditionModel -from ...schedulers import DDIMScheduler, DDPMScheduler -from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import AudioPipelineOutput, BaseOutput, DiffusionPipeline, ImagePipelineOutput +from ....models import AutoencoderKL, UNet2DConditionModel +from ....schedulers import DDIMScheduler, DDPMScheduler +from ....utils.torch_utils import randn_tensor +from ...pipeline_utils import AudioPipelineOutput, BaseOutput, DiffusionPipeline, ImagePipelineOutput from .mel import Mel diff --git a/src/diffusers/pipelines/latent_diffusion_uncond/__init__.py b/src/diffusers/pipelines/deprecated/latent_diffusion_uncond/__init__.py similarity index 87% rename from src/diffusers/pipelines/latent_diffusion_uncond/__init__.py rename to src/diffusers/pipelines/deprecated/latent_diffusion_uncond/__init__.py index 8bb291f1b4fd..214f5bbca969 100644 --- a/src/diffusers/pipelines/latent_diffusion_uncond/__init__.py +++ b/src/diffusers/pipelines/deprecated/latent_diffusion_uncond/__init__.py @@ -1,6 +1,6 @@ from typing import TYPE_CHECKING -from ...utils import DIFFUSERS_SLOW_IMPORT, _LazyModule +from ....utils import DIFFUSERS_SLOW_IMPORT, _LazyModule _import_structure = {"pipeline_latent_diffusion_uncond": ["LDMPipeline"]} diff --git a/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py b/src/diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py similarity index 96% rename from src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py rename to src/diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py index ffcc8129d19f..4e14d1708ccf 100644 --- a/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py +++ b/src/diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py @@ -17,10 +17,10 @@ import torch -from ...models import UNet2DModel, VQModel -from ...schedulers import DDIMScheduler -from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput +from ....models import UNet2DModel, VQModel +from ....schedulers import DDIMScheduler +from ....utils.torch_utils import randn_tensor +from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput class LDMPipeline(DiffusionPipeline): diff --git a/src/diffusers/pipelines/pndm/__init__.py b/src/diffusers/pipelines/deprecated/pndm/__init__.py similarity index 86% rename from src/diffusers/pipelines/pndm/__init__.py rename to src/diffusers/pipelines/deprecated/pndm/__init__.py index 
d904abe76800..5e3bdba74079 100644 --- a/src/diffusers/pipelines/pndm/__init__.py +++ b/src/diffusers/pipelines/deprecated/pndm/__init__.py @@ -1,6 +1,6 @@ from typing import TYPE_CHECKING -from ...utils import DIFFUSERS_SLOW_IMPORT, _LazyModule +from ....utils import DIFFUSERS_SLOW_IMPORT, _LazyModule _import_structure = {"pipeline_pndm": ["PNDMPipeline"]} diff --git a/src/diffusers/pipelines/pndm/pipeline_pndm.py b/src/diffusers/pipelines/deprecated/pndm/pipeline_pndm.py similarity index 95% rename from src/diffusers/pipelines/pndm/pipeline_pndm.py rename to src/diffusers/pipelines/deprecated/pndm/pipeline_pndm.py index 78690997223a..c988e8292987 100644 --- a/src/diffusers/pipelines/pndm/pipeline_pndm.py +++ b/src/diffusers/pipelines/deprecated/pndm/pipeline_pndm.py @@ -17,10 +17,10 @@ import torch -from ...models import UNet2DModel -from ...schedulers import PNDMScheduler -from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput +from ....models import UNet2DModel +from ....schedulers import PNDMScheduler +from ....utils.torch_utils import randn_tensor +from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput class PNDMPipeline(DiffusionPipeline): diff --git a/src/diffusers/pipelines/repaint/__init__.py b/src/diffusers/pipelines/deprecated/repaint/__init__.py similarity index 86% rename from src/diffusers/pipelines/repaint/__init__.py rename to src/diffusers/pipelines/deprecated/repaint/__init__.py index b1b42f7a115e..2c6b04af52d4 100644 --- a/src/diffusers/pipelines/repaint/__init__.py +++ b/src/diffusers/pipelines/deprecated/repaint/__init__.py @@ -1,6 +1,6 @@ from typing import TYPE_CHECKING -from ...utils import DIFFUSERS_SLOW_IMPORT, _LazyModule +from ....utils import DIFFUSERS_SLOW_IMPORT, _LazyModule _import_structure = {"pipeline_repaint": ["RePaintPipeline"]} diff --git a/src/diffusers/pipelines/repaint/pipeline_repaint.py b/src/diffusers/pipelines/deprecated/repaint/pipeline_repaint.py similarity index 97% rename from src/diffusers/pipelines/repaint/pipeline_repaint.py rename to src/diffusers/pipelines/deprecated/repaint/pipeline_repaint.py index 1bbd3d1d03d4..eeea28d4d06f 100644 --- a/src/diffusers/pipelines/repaint/pipeline_repaint.py +++ b/src/diffusers/pipelines/deprecated/repaint/pipeline_repaint.py @@ -19,11 +19,11 @@ import PIL.Image import torch -from ...models import UNet2DModel -from ...schedulers import RePaintScheduler -from ...utils import PIL_INTERPOLATION, deprecate, logging -from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput +from ....models import UNet2DModel +from ....schedulers import RePaintScheduler +from ....utils import PIL_INTERPOLATION, deprecate, logging +from ....utils.torch_utils import randn_tensor +from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput logger = logging.get_logger(__name__) # pylint: disable=invalid-name diff --git a/src/diffusers/pipelines/score_sde_ve/__init__.py b/src/diffusers/pipelines/deprecated/score_sde_ve/__init__.py similarity index 87% rename from src/diffusers/pipelines/score_sde_ve/__init__.py rename to src/diffusers/pipelines/deprecated/score_sde_ve/__init__.py index 0001394ded5c..87c167c3dbd2 100644 --- a/src/diffusers/pipelines/score_sde_ve/__init__.py +++ b/src/diffusers/pipelines/deprecated/score_sde_ve/__init__.py @@ -1,6 +1,6 @@ from typing import TYPE_CHECKING -from ...utils import DIFFUSERS_SLOW_IMPORT, _LazyModule +from ....utils import DIFFUSERS_SLOW_IMPORT, 
_LazyModule _import_structure = {"pipeline_score_sde_ve": ["ScoreSdeVePipeline"]} diff --git a/src/diffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py b/src/diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py similarity index 95% rename from src/diffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py rename to src/diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py index 11d1af710355..b9b3eb08f845 100644 --- a/src/diffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py +++ b/src/diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py @@ -16,10 +16,10 @@ import torch -from ...models import UNet2DModel -from ...schedulers import ScoreSdeVeScheduler -from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput +from ....models import UNet2DModel +from ....schedulers import ScoreSdeVeScheduler +from ....utils.torch_utils import randn_tensor +from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput class ScoreSdeVePipeline(DiffusionPipeline): diff --git a/src/diffusers/pipelines/spectrogram_diffusion/__init__.py b/src/diffusers/pipelines/deprecated/spectrogram_diffusion/__init__.py similarity index 85% rename from src/diffusers/pipelines/spectrogram_diffusion/__init__.py rename to src/diffusers/pipelines/deprecated/spectrogram_diffusion/__init__.py index 2444191368d4..150954baa0eb 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/__init__.py +++ b/src/diffusers/pipelines/deprecated/spectrogram_diffusion/__init__.py @@ -1,7 +1,7 @@ # flake8: noqa from typing import TYPE_CHECKING -from ...utils import DIFFUSERS_SLOW_IMPORT -from ...utils import ( +from ....utils import ( + DIFFUSERS_SLOW_IMPORT, _LazyModule, is_note_seq_available, OptionalDependencyNotAvailable, @@ -17,7 +17,7 @@ if not (is_transformers_available() and is_torch_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ...utils import dummy_torch_and_transformers_objects # noqa F403 + from ....utils import dummy_torch_and_transformers_objects # noqa F403 _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) else: @@ -32,7 +32,7 @@ if not (is_transformers_available() and is_torch_available() and is_note_seq_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ...utils import dummy_transformers_and_torch_and_note_seq_objects + from ....utils import dummy_transformers_and_torch_and_note_seq_objects _dummy_objects.update(get_objects_from_module(dummy_transformers_and_torch_and_note_seq_objects)) else: @@ -45,7 +45,7 @@ raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ...utils.dummy_torch_and_transformers_objects import * + from ....utils.dummy_torch_and_transformers_objects import * else: from .pipeline_spectrogram_diffusion import SpectrogramDiffusionPipeline from .pipeline_spectrogram_diffusion import SpectrogramContEncoder @@ -56,7 +56,7 @@ if not (is_transformers_available() and is_torch_available() and is_note_seq_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ...utils.dummy_transformers_and_torch_and_note_seq_objects import * + from ....utils.dummy_transformers_and_torch_and_note_seq_objects import * else: from .midi_utils import MidiProcessor diff --git a/src/diffusers/pipelines/spectrogram_diffusion/continuous_encoder.py 
b/src/diffusers/pipelines/deprecated/spectrogram_diffusion/continuous_encoder.py similarity index 96% rename from src/diffusers/pipelines/spectrogram_diffusion/continuous_encoder.py rename to src/diffusers/pipelines/deprecated/spectrogram_diffusion/continuous_encoder.py index 556136d4023d..4d4582924144 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/continuous_encoder.py +++ b/src/diffusers/pipelines/deprecated/spectrogram_diffusion/continuous_encoder.py @@ -22,8 +22,8 @@ T5LayerNorm, ) -from ...configuration_utils import ConfigMixin, register_to_config -from ...models import ModelMixin +from ....configuration_utils import ConfigMixin, register_to_config +from ....models import ModelMixin class SpectrogramContEncoder(ModelMixin, ConfigMixin, ModuleUtilsMixin): diff --git a/src/diffusers/pipelines/spectrogram_diffusion/midi_utils.py b/src/diffusers/pipelines/deprecated/spectrogram_diffusion/midi_utils.py similarity index 99% rename from src/diffusers/pipelines/spectrogram_diffusion/midi_utils.py rename to src/diffusers/pipelines/deprecated/spectrogram_diffusion/midi_utils.py index 08d0878db588..a91233edfe30 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/midi_utils.py +++ b/src/diffusers/pipelines/deprecated/spectrogram_diffusion/midi_utils.py @@ -22,7 +22,7 @@ import torch import torch.nn.functional as F -from ...utils import is_note_seq_available +from ....utils import is_note_seq_available from .pipeline_spectrogram_diffusion import TARGET_FEATURE_LENGTH diff --git a/src/diffusers/pipelines/spectrogram_diffusion/notes_encoder.py b/src/diffusers/pipelines/deprecated/spectrogram_diffusion/notes_encoder.py similarity index 96% rename from src/diffusers/pipelines/spectrogram_diffusion/notes_encoder.py rename to src/diffusers/pipelines/deprecated/spectrogram_diffusion/notes_encoder.py index 94eaa176f3e5..f2a1ca24f5ff 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/notes_encoder.py +++ b/src/diffusers/pipelines/deprecated/spectrogram_diffusion/notes_encoder.py @@ -18,8 +18,8 @@ from transformers.modeling_utils import ModuleUtilsMixin from transformers.models.t5.modeling_t5 import T5Block, T5Config, T5LayerNorm -from ...configuration_utils import ConfigMixin, register_to_config -from ...models import ModelMixin +from ....configuration_utils import ConfigMixin, register_to_config +from ....models import ModelMixin class SpectrogramNotesEncoder(ModelMixin, ConfigMixin, ModuleUtilsMixin): diff --git a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py similarity index 97% rename from src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py rename to src/diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 88725af452c2..b803d921a388 100644 --- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -19,16 +19,16 @@ import numpy as np import torch -from ...models import T5FilmDecoder -from ...schedulers import DDPMScheduler -from ...utils import is_onnx_available, logging -from ...utils.torch_utils import randn_tensor +from ....models import T5FilmDecoder +from ....schedulers import DDPMScheduler +from ....utils import is_onnx_available, logging +from ....utils.torch_utils import randn_tensor if is_onnx_available(): - from ..onnx_utils import OnnxRuntimeModel + 
from ...onnx_utils import OnnxRuntimeModel -from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline +from ...pipeline_utils import AudioPipelineOutput, DiffusionPipeline from .continuous_encoder import SpectrogramContEncoder from .notes_encoder import SpectrogramNotesEncoder diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/__init__.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/__init__.py new file mode 100644 index 000000000000..36cf1a33ce6a --- /dev/null +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/__init__.py @@ -0,0 +1,55 @@ +from typing import TYPE_CHECKING + +from ....utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ....utils import dummy_torch_and_transformers_objects + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["pipeline_cycle_diffusion"] = ["CycleDiffusionPipeline"] + _import_structure["pipeline_stable_diffusion_inpaint_legacy"] = ["StableDiffusionInpaintPipelineLegacy"] + _import_structure["pipeline_stable_diffusion_model_editing"] = ["StableDiffusionModelEditingPipeline"] + + _import_structure["pipeline_stable_diffusion_paradigms"] = ["StableDiffusionParadigmsPipeline"] + _import_structure["pipeline_stable_diffusion_pix2pix_zero"] = ["StableDiffusionPix2PixZeroPipeline"] + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ....utils.dummy_torch_and_transformers_objects import * + + else: + from .pipeline_cycle_diffusion import CycleDiffusionPipeline + from .pipeline_stable_diffusion_inpaint_legacy import StableDiffusionInpaintPipelineLegacy + from .pipeline_stable_diffusion_model_editing import StableDiffusionModelEditingPipeline + from .pipeline_stable_diffusion_paradigms import StableDiffusionParadigmsPipeline + from .pipeline_stable_diffusion_pix2pix_zero import StableDiffusionPix2PixZeroPipeline + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py similarity index 98% rename from src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py rename to src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py index 7b87c1065bd1..9d2b3ca8abaf 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py @@ -21,17 +21,17 @@ from packaging import version from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer -from ...configuration_utils import FrozenDict -from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin -from ...models import AutoencoderKL, 
UNet2DConditionModel -from ...models.lora import adjust_lora_scale_text_encoder -from ...schedulers import DDIMScheduler -from ...utils import PIL_INTERPOLATION, USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers -from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline -from .pipeline_output import StableDiffusionPipelineOutput -from .safety_checker import StableDiffusionSafetyChecker +from ....configuration_utils import FrozenDict +from ....image_processor import PipelineImageInput, VaeImageProcessor +from ....loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ....models import AutoencoderKL, UNet2DConditionModel +from ....models.lora import adjust_lora_scale_text_encoder +from ....schedulers import DDIMScheduler +from ....utils import PIL_INTERPOLATION, USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers +from ....utils.torch_utils import randn_tensor +from ...pipeline_utils import DiffusionPipeline +from ...stable_diffusion.pipeline_output import StableDiffusionPipelineOutput +from ...stable_diffusion.safety_checker import StableDiffusionSafetyChecker logger = logging.get_logger(__name__) # pylint: disable=invalid-name diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py similarity index 98% rename from src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint_legacy.py rename to src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py index 40abc477e7c0..0aa5e68bfcb4 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint_legacy.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py @@ -6,12 +6,12 @@ import torch from transformers import CLIPImageProcessor, CLIPTokenizer -from ...configuration_utils import FrozenDict -from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler -from ...utils import deprecate, logging -from ..onnx_utils import ORT_TO_NP_TYPE, OnnxRuntimeModel -from ..pipeline_utils import DiffusionPipeline -from . 
import StableDiffusionPipelineOutput +from ....configuration_utils import FrozenDict +from ....schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler +from ....utils import deprecate, logging +from ...onnx_utils import ORT_TO_NP_TYPE, OnnxRuntimeModel +from ...pipeline_utils import DiffusionPipeline +from ...stable_diffusion.pipeline_output import StableDiffusionPipelineOutput logger = logging.get_logger(__name__) # pylint: disable=invalid-name diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py similarity index 98% rename from src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py rename to src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py index 15e6f60569a3..4daa1c07f0c6 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py @@ -21,17 +21,17 @@ from packaging import version from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer -from ...configuration_utils import FrozenDict -from ...image_processor import VaeImageProcessor -from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin -from ...models import AutoencoderKL, UNet2DConditionModel -from ...models.lora import adjust_lora_scale_text_encoder -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import PIL_INTERPOLATION, USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers -from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline -from . 
import StableDiffusionPipelineOutput -from .safety_checker import StableDiffusionSafetyChecker +from ....configuration_utils import FrozenDict +from ....image_processor import VaeImageProcessor +from ....loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ....models import AutoencoderKL, UNet2DConditionModel +from ....models.lora import adjust_lora_scale_text_encoder +from ....schedulers import KarrasDiffusionSchedulers +from ....utils import PIL_INTERPOLATION, USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers +from ....utils.torch_utils import randn_tensor +from ...pipeline_utils import DiffusionPipeline +from ...stable_diffusion import StableDiffusionPipelineOutput +from ...stable_diffusion.safety_checker import StableDiffusionSafetyChecker logger = logging.get_logger(__name__) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py similarity index 98% rename from src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py rename to src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py index 5d1c2983d448..1ee0e0161db9 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py @@ -18,17 +18,17 @@ import torch from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer -from ...image_processor import VaeImageProcessor -from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin -from ...models import AutoencoderKL, UNet2DConditionModel -from ...models.lora import adjust_lora_scale_text_encoder -from ...schedulers import PNDMScheduler -from ...schedulers.scheduling_utils import SchedulerMixin -from ...utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers -from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline -from . 
import StableDiffusionPipelineOutput -from .safety_checker import StableDiffusionSafetyChecker +from ....image_processor import VaeImageProcessor +from ....loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ....models import AutoencoderKL, UNet2DConditionModel +from ....models.lora import adjust_lora_scale_text_encoder +from ....schedulers import PNDMScheduler +from ....schedulers.scheduling_utils import SchedulerMixin +from ....utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers +from ....utils.torch_utils import randn_tensor +from ...pipeline_utils import DiffusionPipeline +from ...stable_diffusion.pipeline_output import StableDiffusionPipelineOutput +from ...stable_diffusion.safety_checker import StableDiffusionSafetyChecker logger = logging.get_logger(__name__) # pylint: disable=invalid-name diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_paradigms.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py similarity index 98% rename from src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_paradigms.py rename to src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py index f0368b4ca305..3c9d744c6dfa 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_paradigms.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py @@ -18,12 +18,12 @@ import torch from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer -from ...image_processor import VaeImageProcessor -from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin -from ...models import AutoencoderKL, UNet2DConditionModel -from ...models.lora import adjust_lora_scale_text_encoder -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import ( +from ....image_processor import VaeImageProcessor +from ....loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ....models import AutoencoderKL, UNet2DConditionModel +from ....models.lora import adjust_lora_scale_text_encoder +from ....schedulers import KarrasDiffusionSchedulers +from ....utils import ( USE_PEFT_BACKEND, deprecate, logging, @@ -31,10 +31,10 @@ scale_lora_layers, unscale_lora_layers, ) -from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline -from . 
import StableDiffusionPipelineOutput -from .safety_checker import StableDiffusionSafetyChecker +from ....utils.torch_utils import randn_tensor +from ...pipeline_utils import DiffusionPipeline +from ...stable_diffusion.pipeline_output import StableDiffusionPipelineOutput +from ...stable_diffusion.safety_checker import StableDiffusionSafetyChecker logger = logging.get_logger(__name__) # pylint: disable=invalid-name diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py similarity index 98% rename from src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py rename to src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py index 9b3e43480fb8..7afb1f7e3ad3 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py @@ -28,14 +28,14 @@ CLIPTokenizer, ) -from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin -from ...models import AutoencoderKL, UNet2DConditionModel -from ...models.attention_processor import Attention -from ...models.lora import adjust_lora_scale_text_encoder -from ...schedulers import DDIMScheduler, DDPMScheduler, EulerAncestralDiscreteScheduler, LMSDiscreteScheduler -from ...schedulers.scheduling_ddim_inverse import DDIMInverseScheduler -from ...utils import ( +from ....image_processor import PipelineImageInput, VaeImageProcessor +from ....loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ....models import AutoencoderKL, UNet2DConditionModel +from ....models.attention_processor import Attention +from ....models.lora import adjust_lora_scale_text_encoder +from ....schedulers import DDIMScheduler, DDPMScheduler, EulerAncestralDiscreteScheduler, LMSDiscreteScheduler +from ....schedulers.scheduling_ddim_inverse import DDIMInverseScheduler +from ....utils import ( PIL_INTERPOLATION, USE_PEFT_BACKEND, BaseOutput, @@ -45,10 +45,10 @@ scale_lora_layers, unscale_lora_layers, ) -from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline -from . 
import StableDiffusionPipelineOutput -from .safety_checker import StableDiffusionSafetyChecker +from ....utils.torch_utils import randn_tensor +from ...pipeline_utils import DiffusionPipeline +from ...stable_diffusion.pipeline_output import StableDiffusionPipelineOutput +from ...stable_diffusion.safety_checker import StableDiffusionSafetyChecker logger = logging.get_logger(__name__) # pylint: disable=invalid-name diff --git a/src/diffusers/pipelines/stochastic_karras_ve/__init__.py b/src/diffusers/pipelines/deprecated/stochastic_karras_ve/__init__.py similarity index 87% rename from src/diffusers/pipelines/stochastic_karras_ve/__init__.py rename to src/diffusers/pipelines/deprecated/stochastic_karras_ve/__init__.py index 539e920e6dec..15c9a8c27f98 100644 --- a/src/diffusers/pipelines/stochastic_karras_ve/__init__.py +++ b/src/diffusers/pipelines/deprecated/stochastic_karras_ve/__init__.py @@ -1,6 +1,6 @@ from typing import TYPE_CHECKING -from ...utils import DIFFUSERS_SLOW_IMPORT, _LazyModule +from ....utils import DIFFUSERS_SLOW_IMPORT, _LazyModule _import_structure = {"pipeline_stochastic_karras_ve": ["KarrasVePipeline"]} diff --git a/src/diffusers/pipelines/stochastic_karras_ve/pipeline_stochastic_karras_ve.py b/src/diffusers/pipelines/deprecated/stochastic_karras_ve/pipeline_stochastic_karras_ve.py similarity index 96% rename from src/diffusers/pipelines/stochastic_karras_ve/pipeline_stochastic_karras_ve.py rename to src/diffusers/pipelines/deprecated/stochastic_karras_ve/pipeline_stochastic_karras_ve.py index d850f5a73351..55ca6186626d 100644 --- a/src/diffusers/pipelines/stochastic_karras_ve/pipeline_stochastic_karras_ve.py +++ b/src/diffusers/pipelines/deprecated/stochastic_karras_ve/pipeline_stochastic_karras_ve.py @@ -16,10 +16,10 @@ import torch -from ...models import UNet2DModel -from ...schedulers import KarrasVeScheduler -from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput +from ....models import UNet2DModel +from ....schedulers import KarrasVeScheduler +from ....utils.torch_utils import randn_tensor +from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput class KarrasVePipeline(DiffusionPipeline): diff --git a/src/diffusers/pipelines/versatile_diffusion/__init__.py b/src/diffusers/pipelines/deprecated/versatile_diffusion/__init__.py similarity index 94% rename from src/diffusers/pipelines/versatile_diffusion/__init__.py rename to src/diffusers/pipelines/deprecated/versatile_diffusion/__init__.py index 6eafd5125e32..8ea6ef6e2f65 100644 --- a/src/diffusers/pipelines/versatile_diffusion/__init__.py +++ b/src/diffusers/pipelines/deprecated/versatile_diffusion/__init__.py @@ -1,6 +1,6 @@ from typing import TYPE_CHECKING -from ...utils import ( +from ....utils import ( DIFFUSERS_SLOW_IMPORT, OptionalDependencyNotAvailable, _LazyModule, @@ -17,7 +17,7 @@ if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0")): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ...utils.dummy_torch_and_transformers_objects import ( + from ....utils.dummy_torch_and_transformers_objects import ( VersatileDiffusionDualGuidedPipeline, VersatileDiffusionImageVariationPipeline, VersatileDiffusionPipeline, @@ -45,7 +45,7 @@ if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0")): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from 
...utils.dummy_torch_and_transformers_objects import ( + from ....utils.dummy_torch_and_transformers_objects import ( VersatileDiffusionDualGuidedPipeline, VersatileDiffusionImageVariationPipeline, VersatileDiffusionPipeline, diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py similarity index 99% rename from src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py rename to src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py index 8ac63636df86..112aa42323f9 100644 --- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py @@ -7,10 +7,10 @@ from diffusers.utils import deprecate -from ...configuration_utils import ConfigMixin, register_to_config -from ...models import ModelMixin -from ...models.activations import get_activation -from ...models.attention_processor import ( +from ....configuration_utils import ConfigMixin, register_to_config +from ....models import ModelMixin +from ....models.activations import get_activation +from ....models.attention_processor import ( ADDED_KV_ATTENTION_PROCESSORS, CROSS_ATTENTION_PROCESSORS, Attention, @@ -19,8 +19,8 @@ AttnAddedKVProcessor2_0, AttnProcessor, ) -from ...models.dual_transformer_2d import DualTransformer2DModel -from ...models.embeddings import ( +from ....models.dual_transformer_2d import DualTransformer2DModel +from ....models.embeddings import ( GaussianFourierProjection, ImageHintTimeEmbedding, ImageProjection, @@ -31,10 +31,10 @@ TimestepEmbedding, Timesteps, ) -from ...models.transformer_2d import Transformer2DModel -from ...models.unet_2d_condition import UNet2DConditionOutput -from ...utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers -from ...utils.torch_utils import apply_freeu +from ....models.transformer_2d import Transformer2DModel +from ....models.unet_2d_condition import UNet2DConditionOutput +from ....utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers +from ....utils.torch_utils import apply_freeu logger = logging.get_logger(__name__) # pylint: disable=invalid-name diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion.py b/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py similarity index 99% rename from src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion.py rename to src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py index 68c720ab2ad0..4455d20df213 100644 --- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion.py +++ b/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py @@ -5,10 +5,10 @@ import torch from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModel -from ...models import AutoencoderKL, UNet2DConditionModel -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import logging -from ..pipeline_utils import DiffusionPipeline +from ....models import AutoencoderKL, UNet2DConditionModel +from ....schedulers import KarrasDiffusionSchedulers +from ....utils import logging +from ...pipeline_utils import DiffusionPipeline from .pipeline_versatile_diffusion_dual_guided import VersatileDiffusionDualGuidedPipeline from .pipeline_versatile_diffusion_image_variation import 
VersatileDiffusionImageVariationPipeline from .pipeline_versatile_diffusion_text_to_image import VersatileDiffusionTextToImagePipeline diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py b/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py similarity index 98% rename from src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py rename to src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py index 8f8bf260ca56..168e6a44a5c9 100644 --- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +++ b/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py @@ -26,12 +26,12 @@ CLIPVisionModelWithProjection, ) -from ...image_processor import VaeImageProcessor -from ...models import AutoencoderKL, DualTransformer2DModel, Transformer2DModel, UNet2DConditionModel -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import deprecate, logging -from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput +from ....image_processor import VaeImageProcessor +from ....models import AutoencoderKL, DualTransformer2DModel, Transformer2DModel, UNet2DConditionModel +from ....schedulers import KarrasDiffusionSchedulers +from ....utils import deprecate, logging +from ....utils.torch_utils import randn_tensor +from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput from .modeling_text_unet import UNetFlatConditionModel diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py b/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py similarity index 98% rename from src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py rename to src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py index bcad6f93ef96..a2111283a6dd 100644 --- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +++ b/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py @@ -21,12 +21,12 @@ import torch.utils.checkpoint from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection -from ...image_processor import VaeImageProcessor -from ...models import AutoencoderKL, UNet2DConditionModel -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import deprecate, logging -from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput +from ....image_processor import VaeImageProcessor +from ....models import AutoencoderKL, UNet2DConditionModel +from ....schedulers import KarrasDiffusionSchedulers +from ....utils import deprecate, logging +from ....utils.torch_utils import randn_tensor +from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput logger = logging.get_logger(__name__) # pylint: disable=invalid-name diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py b/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py similarity index 98% rename from src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py rename to 
src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py index d8f947e64af7..de6ab3891214 100644 --- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +++ b/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py @@ -19,12 +19,12 @@ import torch.utils.checkpoint from transformers import CLIPImageProcessor, CLIPTextModelWithProjection, CLIPTokenizer -from ...image_processor import VaeImageProcessor -from ...models import AutoencoderKL, Transformer2DModel, UNet2DConditionModel -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import deprecate, logging -from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput +from ....image_processor import VaeImageProcessor +from ....models import AutoencoderKL, Transformer2DModel, UNet2DConditionModel +from ....schedulers import KarrasDiffusionSchedulers +from ....utils import deprecate, logging +from ....utils.torch_utils import randn_tensor +from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput from .modeling_text_unet import UNetFlatConditionModel diff --git a/src/diffusers/pipelines/vq_diffusion/__init__.py b/src/diffusers/pipelines/deprecated/vq_diffusion/__init__.py similarity index 90% rename from src/diffusers/pipelines/vq_diffusion/__init__.py rename to src/diffusers/pipelines/deprecated/vq_diffusion/__init__.py index c2a22e76ae45..070903377c71 100644 --- a/src/diffusers/pipelines/vq_diffusion/__init__.py +++ b/src/diffusers/pipelines/deprecated/vq_diffusion/__init__.py @@ -1,6 +1,6 @@ from typing import TYPE_CHECKING -from ...utils import ( +from ....utils import ( DIFFUSERS_SLOW_IMPORT, OptionalDependencyNotAvailable, _LazyModule, @@ -16,7 +16,7 @@ if not (is_transformers_available() and is_torch_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ...utils.dummy_torch_and_transformers_objects import ( + from ....utils.dummy_torch_and_transformers_objects import ( LearnedClassifierFreeSamplingEmbeddings, VQDiffusionPipeline, ) @@ -36,7 +36,7 @@ if not (is_transformers_available() and is_torch_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ...utils.dummy_torch_and_transformers_objects import ( + from ....utils.dummy_torch_and_transformers_objects import ( LearnedClassifierFreeSamplingEmbeddings, VQDiffusionPipeline, ) diff --git a/src/diffusers/pipelines/vq_diffusion/pipeline_vq_diffusion.py b/src/diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py similarity index 98% rename from src/diffusers/pipelines/vq_diffusion/pipeline_vq_diffusion.py rename to src/diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py index 1abe50a9b6b6..d7c2945b463c 100644 --- a/src/diffusers/pipelines/vq_diffusion/pipeline_vq_diffusion.py +++ b/src/diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py @@ -17,11 +17,11 @@ import torch from transformers import CLIPTextModel, CLIPTokenizer -from ...configuration_utils import ConfigMixin, register_to_config -from ...models import ModelMixin, Transformer2DModel, VQModel -from ...schedulers import VQDiffusionScheduler -from ...utils import logging -from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput +from ....configuration_utils import ConfigMixin, register_to_config +from ....models import ModelMixin, Transformer2DModel, VQModel +from ....schedulers 
import VQDiffusionScheduler +from ....utils import logging +from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput logger = logging.get_logger(__name__) # pylint: disable=invalid-name diff --git a/src/diffusers/pipelines/stable_diffusion/__init__.py b/src/diffusers/pipelines/stable_diffusion/__init__.py index d81831082e2f..dbd79ec1f367 100644 --- a/src/diffusers/pipelines/stable_diffusion/__init__.py +++ b/src/diffusers/pipelines/stable_diffusion/__init__.py @@ -134,7 +134,6 @@ else: from .clip_image_project_model import CLIPImageProjection - from .pipeline_cycle_diffusion import CycleDiffusionPipeline from .pipeline_stable_diffusion import ( StableDiffusionPipeline, StableDiffusionPipelineOutput, @@ -149,9 +148,6 @@ ) from .pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipeline from .pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipeline - from .pipeline_stable_diffusion_inpaint_legacy import ( - StableDiffusionInpaintPipelineLegacy, - ) from .pipeline_stable_diffusion_instruct_pix2pix import ( StableDiffusionInstructPix2PixPipeline, ) @@ -159,13 +155,7 @@ StableDiffusionLatentUpscalePipeline, ) from .pipeline_stable_diffusion_ldm3d import StableDiffusionLDM3DPipeline - from .pipeline_stable_diffusion_model_editing import ( - StableDiffusionModelEditingPipeline, - ) from .pipeline_stable_diffusion_panorama import StableDiffusionPanoramaPipeline - from .pipeline_stable_diffusion_paradigms import ( - StableDiffusionParadigmsPipeline, - ) from .pipeline_stable_diffusion_sag import StableDiffusionSAGPipeline from .pipeline_stable_diffusion_upscale import StableDiffusionUpscalePipeline from .pipeline_stable_unclip import StableUnCLIPPipeline @@ -199,9 +189,6 @@ StableDiffusionDepth2ImgPipeline, ) from .pipeline_stable_diffusion_diffedit import StableDiffusionDiffEditPipeline - from .pipeline_stable_diffusion_pix2pix_zero import ( - StableDiffusionPix2PixZeroPipeline, - ) try: if not ( @@ -234,9 +221,6 @@ from .pipeline_onnx_stable_diffusion_inpaint import ( OnnxStableDiffusionInpaintPipeline, ) - from .pipeline_onnx_stable_diffusion_inpaint_legacy import ( - OnnxStableDiffusionInpaintPipelineLegacy, - ) from .pipeline_onnx_stable_diffusion_upscale import ( OnnxStableDiffusionUpscalePipeline, ) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py index 3d48c811cdf1..81d936be62b4 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py @@ -788,7 +788,6 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype latents = latents * self.scheduler.init_noise_sigma return latents - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_pix2pix_zero.StableDiffusionPix2PixZeroPipeline.prepare_image_latents def prepare_image_latents(self, image, batch_size, dtype, device, generator=None): if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)): raise ValueError( diff --git a/tests/pipelines/altdiffusion/__init__.py b/tests/pipelines/altdiffusion/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/pipelines/altdiffusion/test_alt_diffusion.py b/tests/pipelines/altdiffusion/test_alt_diffusion.py deleted file mode 100644 index b4a2847bb84d..000000000000 --- a/tests/pipelines/altdiffusion/test_alt_diffusion.py +++ /dev/null @@ -1,260 
+0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, XLMRobertaTokenizer - -from diffusers import AltDiffusionPipeline, AutoencoderKL, DDIMScheduler, PNDMScheduler, UNet2DConditionModel -from diffusers.pipelines.alt_diffusion.modeling_roberta_series import ( - RobertaSeriesConfig, - RobertaSeriesModelWithTransformation, -) -from diffusers.utils.testing_utils import enable_full_determinism, nightly, require_torch_gpu, torch_device - -from ..pipeline_params import ( - TEXT_TO_IMAGE_BATCH_PARAMS, - TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS, - TEXT_TO_IMAGE_IMAGE_PARAMS, - TEXT_TO_IMAGE_PARAMS, -) -from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin - - -enable_full_determinism() - - -class AltDiffusionPipelineFastTests( - PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase -): - pipeline_class = AltDiffusionPipeline - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - image_params = TEXT_TO_IMAGE_IMAGE_PARAMS - image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS - callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - - # TODO: address the non-deterministic text encoder (fails for save-load tests) - # torch.manual_seed(0) - # text_encoder_config = RobertaSeriesConfig( - # hidden_size=32, - # project_dim=32, - # intermediate_size=37, - # layer_norm_eps=1e-05, - # num_attention_heads=4, - # num_hidden_layers=5, - # vocab_size=5002, - # ) - # text_encoder = RobertaSeriesModelWithTransformation(text_encoder_config) - - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - projection_dim=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=5002, - ) - text_encoder = CLIPTextModel(text_encoder_config) - - tokenizer = XLMRobertaTokenizer.from_pretrained("hf-internal-testing/tiny-xlm-roberta") - tokenizer.model_max_length = 77 - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": 
text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - "image_encoder": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - } - return inputs - - def test_attention_slicing_forward_pass(self): - super().test_attention_slicing_forward_pass(expected_max_diff=3e-3) - - def test_inference_batch_single_identical(self): - super().test_inference_batch_single_identical(expected_max_diff=3e-3) - - def test_alt_diffusion_ddim(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - torch.manual_seed(0) - text_encoder_config = RobertaSeriesConfig( - hidden_size=32, - project_dim=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - vocab_size=5002, - ) - # TODO: remove after fixing the non-deterministic text encoder - text_encoder = RobertaSeriesModelWithTransformation(text_encoder_config) - components["text_encoder"] = text_encoder - - alt_pipe = AltDiffusionPipeline(**components) - alt_pipe = alt_pipe.to(device) - alt_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - inputs["prompt"] = "A photo of an astronaut" - output = alt_pipe(**inputs) - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [0.5748162, 0.60447145, 0.48821217, 0.50100636, 0.5431185, 0.45763683, 0.49657696, 0.48132733, 0.47573093] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_alt_diffusion_pndm(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - components["scheduler"] = PNDMScheduler(skip_prk_steps=True) - torch.manual_seed(0) - text_encoder_config = RobertaSeriesConfig( - hidden_size=32, - project_dim=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - vocab_size=5002, - ) - # TODO: remove after fixing the non-deterministic text encoder - text_encoder = RobertaSeriesModelWithTransformation(text_encoder_config) - components["text_encoder"] = text_encoder - alt_pipe = AltDiffusionPipeline(**components) - alt_pipe = alt_pipe.to(device) - alt_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = alt_pipe(**inputs) - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [0.51605093, 0.5707241, 0.47365507, 0.50578886, 0.5633877, 0.4642503, 0.5182081, 0.48763484, 0.49084237] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - -@nightly -@require_torch_gpu -class AltDiffusionPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_alt_diffusion(self): - # make sure here that pndm scheduler skips prk - alt_pipe = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion", safety_checker=None) - alt_pipe = alt_pipe.to(torch_device) - alt_pipe.set_progress_bar_config(disable=None) - - 
prompt = "A painting of a squirrel eating a burger" - generator = torch.manual_seed(0) - output = alt_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=20, output_type="np") - - image = output.images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.1010, 0.0800, 0.0794, 0.0885, 0.0843, 0.0762, 0.0769, 0.0729, 0.0586]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_alt_diffusion_fast_ddim(self): - scheduler = DDIMScheduler.from_pretrained("BAAI/AltDiffusion", subfolder="scheduler") - - alt_pipe = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion", scheduler=scheduler, safety_checker=None) - alt_pipe = alt_pipe.to(torch_device) - alt_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.manual_seed(0) - - output = alt_pipe([prompt], generator=generator, num_inference_steps=2, output_type="numpy") - image = output.images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.4019, 0.4052, 0.3810, 0.4119, 0.3916, 0.3982, 0.4651, 0.4195, 0.5323]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py b/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py deleted file mode 100644 index 3fd1a90172ca..000000000000 --- a/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py +++ /dev/null @@ -1,309 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc -import random -import unittest - -import numpy as np -import torch -from transformers import XLMRobertaTokenizer - -from diffusers import ( - AltDiffusionImg2ImgPipeline, - AutoencoderKL, - PNDMScheduler, - UNet2DConditionModel, -) -from diffusers.image_processor import VaeImageProcessor -from diffusers.pipelines.alt_diffusion.modeling_roberta_series import ( - RobertaSeriesConfig, - RobertaSeriesModelWithTransformation, -) -from diffusers.utils import load_image -from diffusers.utils.testing_utils import ( - enable_full_determinism, - floats_tensor, - load_numpy, - nightly, - require_torch_gpu, - torch_device, -) - - -enable_full_determinism() - - -class AltDiffusionImg2ImgPipelineFastTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - @property - def dummy_image(self): - batch_size = 1 - num_channels = 3 - sizes = (32, 32) - - image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device) - return image - - @property - def dummy_cond_unet(self): - torch.manual_seed(0) - model = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - return model - - @property - def dummy_vae(self): - torch.manual_seed(0) - model = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - return model - - @property - def dummy_text_encoder(self): - torch.manual_seed(0) - config = RobertaSeriesConfig( - hidden_size=32, - project_dim=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=5006, - ) - return RobertaSeriesModelWithTransformation(config) - - @property - def dummy_extractor(self): - def extract(*args, **kwargs): - class Out: - def __init__(self): - self.pixel_values = torch.ones([0]) - - def to(self, device): - self.pixel_values.to(device) - return self - - return Out() - - return extract - - def test_stable_diffusion_img2img_default_case(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - unet = self.dummy_cond_unet - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = XLMRobertaTokenizer.from_pretrained("hf-internal-testing/tiny-xlm-roberta") - tokenizer.model_max_length = 77 - - init_image = self.dummy_image.to(device) - init_image = init_image / 2 + 0.5 - - # make sure here that pndm scheduler skips prk - alt_pipe = AltDiffusionImg2ImgPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - image_encoder=None, - ) - alt_pipe.image_processor = VaeImageProcessor(vae_scale_factor=alt_pipe.vae_scale_factor, do_normalize=True) - alt_pipe = alt_pipe.to(device) - alt_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.Generator(device=device).manual_seed(0) - output = alt_pipe( - [prompt], - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - image=init_image, - ) - - image = 
output.images - - generator = torch.Generator(device=device).manual_seed(0) - image_from_tuple = alt_pipe( - [prompt], - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - image=init_image, - return_dict=False, - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.4427, 0.3731, 0.4249, 0.4941, 0.4546, 0.4148, 0.4193, 0.4666, 0.4499]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-3 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 5e-3 - - @unittest.skipIf(torch_device != "cuda", "This test requires a GPU") - def test_stable_diffusion_img2img_fp16(self): - """Test that stable diffusion img2img works with fp16""" - unet = self.dummy_cond_unet - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = XLMRobertaTokenizer.from_pretrained("hf-internal-testing/tiny-xlm-roberta") - tokenizer.model_max_length = 77 - - init_image = self.dummy_image.to(torch_device) - - # put models in fp16 - unet = unet.half() - vae = vae.half() - bert = bert.half() - - # make sure here that pndm scheduler skips prk - alt_pipe = AltDiffusionImg2ImgPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - image_encoder=None, - ) - alt_pipe.image_processor = VaeImageProcessor(vae_scale_factor=alt_pipe.vae_scale_factor, do_normalize=False) - alt_pipe = alt_pipe.to(torch_device) - alt_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.manual_seed(0) - image = alt_pipe( - [prompt], - generator=generator, - num_inference_steps=2, - output_type="np", - image=init_image, - ).images - - assert image.shape == (1, 32, 32, 3) - - @unittest.skipIf(torch_device != "cuda", "This test requires a GPU") - def test_stable_diffusion_img2img_pipeline_multiple_of_8(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/img2img/sketch-mountains-input.jpg" - ) - # resize to resolution that is divisible by 8 but not 16 or 32 - init_image = init_image.resize((760, 504)) - - model_id = "BAAI/AltDiffusion" - pipe = AltDiffusionImg2ImgPipeline.from_pretrained( - model_id, - safety_checker=None, - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - prompt = "A fantasy landscape, trending on artstation" - - generator = torch.manual_seed(0) - output = pipe( - prompt=prompt, - image=init_image, - strength=0.75, - guidance_scale=7.5, - generator=generator, - output_type="np", - ) - image = output.images[0] - - image_slice = image[255:258, 383:386, -1] - - assert image.shape == (504, 760, 3) - expected_slice = np.array([0.9358, 0.9397, 0.9599, 0.9901, 1.0000, 1.0000, 0.9882, 1.0000, 1.0000]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - -@nightly -@require_torch_gpu -class AltDiffusionImg2ImgPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_stable_diffusion_img2img_pipeline_default(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - 
"/img2img/sketch-mountains-input.jpg" - ) - init_image = init_image.resize((768, 512)) - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/img2img/fantasy_landscape_alt.npy" - ) - - model_id = "BAAI/AltDiffusion" - pipe = AltDiffusionImg2ImgPipeline.from_pretrained( - model_id, - safety_checker=None, - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - prompt = "A fantasy landscape, trending on artstation" - - generator = torch.manual_seed(0) - output = pipe( - prompt=prompt, - image=init_image, - strength=0.75, - guidance_scale=7.5, - generator=generator, - output_type="np", - ) - image = output.images[0] - - assert image.shape == (512, 768, 3) - # img2img is flaky across GPUs even in fp32, so using MAE here - assert np.abs(expected_image - image).max() < 1e-2 diff --git a/tests/pipelines/audio_diffusion/__init__.py b/tests/pipelines/audio_diffusion/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/pipelines/audio_diffusion/test_audio_diffusion.py b/tests/pipelines/audio_diffusion/test_audio_diffusion.py deleted file mode 100644 index 2cf3e4a95609..000000000000 --- a/tests/pipelines/audio_diffusion/test_audio_diffusion.py +++ /dev/null @@ -1,203 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc -import unittest - -import numpy as np -import torch - -from diffusers import ( - AudioDiffusionPipeline, - AutoencoderKL, - DDIMScheduler, - DDPMScheduler, - DiffusionPipeline, - Mel, - UNet2DConditionModel, - UNet2DModel, -) -from diffusers.utils.testing_utils import enable_full_determinism, nightly, require_torch_gpu, torch_device - - -enable_full_determinism() - - -class PipelineFastTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - @property - def dummy_unet(self): - torch.manual_seed(0) - model = UNet2DModel( - sample_size=(32, 64), - in_channels=1, - out_channels=1, - layers_per_block=2, - block_out_channels=(128, 128), - down_block_types=("AttnDownBlock2D", "DownBlock2D"), - up_block_types=("UpBlock2D", "AttnUpBlock2D"), - ) - return model - - @property - def dummy_unet_condition(self): - torch.manual_seed(0) - model = UNet2DConditionModel( - sample_size=(64, 32), - in_channels=1, - out_channels=1, - layers_per_block=2, - block_out_channels=(128, 128), - down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"), - up_block_types=("UpBlock2D", "CrossAttnUpBlock2D"), - cross_attention_dim=10, - ) - return model - - @property - def dummy_vqvae_and_unet(self): - torch.manual_seed(0) - vqvae = AutoencoderKL( - sample_size=(128, 64), - in_channels=1, - out_channels=1, - latent_channels=1, - layers_per_block=2, - block_out_channels=(128, 128), - down_block_types=("DownEncoderBlock2D", "DownEncoderBlock2D"), - up_block_types=("UpDecoderBlock2D", "UpDecoderBlock2D"), - ) - unet = UNet2DModel( - sample_size=(64, 32), - in_channels=1, - out_channels=1, - layers_per_block=2, - block_out_channels=(128, 128), - down_block_types=("AttnDownBlock2D", "DownBlock2D"), - up_block_types=("UpBlock2D", "AttnUpBlock2D"), - ) - return vqvae, unet - - @nightly - def test_audio_diffusion(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - mel = Mel( - x_res=self.dummy_unet.config.sample_size[1], - y_res=self.dummy_unet.config.sample_size[0], - ) - - scheduler = DDPMScheduler() - pipe = AudioDiffusionPipeline(vqvae=None, unet=self.dummy_unet, mel=mel, scheduler=scheduler) - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - generator = torch.Generator(device=device).manual_seed(42) - output = pipe(generator=generator, steps=4) - audio = output.audios[0] - image = output.images[0] - - generator = torch.Generator(device=device).manual_seed(42) - output = pipe(generator=generator, steps=4, return_dict=False) - image_from_tuple = output[0][0] - - assert audio.shape == (1, (self.dummy_unet.config.sample_size[1] - 1) * mel.hop_length) - assert ( - image.height == self.dummy_unet.config.sample_size[0] - and image.width == self.dummy_unet.config.sample_size[1] - ) - image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10] - image_from_tuple_slice = np.frombuffer(image_from_tuple.tobytes(), dtype="uint8")[:10] - expected_slice = np.array([69, 255, 255, 255, 0, 0, 77, 181, 12, 127]) - - assert np.abs(image_slice.flatten() - expected_slice).max() == 0 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() == 0 - - mel = Mel( - x_res=self.dummy_vqvae_and_unet[0].config.sample_size[1], - y_res=self.dummy_vqvae_and_unet[0].config.sample_size[0], - ) - - scheduler = DDIMScheduler() - dummy_vqvae_and_unet = self.dummy_vqvae_and_unet - pipe = AudioDiffusionPipeline( - vqvae=self.dummy_vqvae_and_unet[0], unet=dummy_vqvae_and_unet[1], 
mel=mel, scheduler=scheduler - ) - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - np.random.seed(0) - raw_audio = np.random.uniform(-1, 1, ((dummy_vqvae_and_unet[0].config.sample_size[1] - 1) * mel.hop_length,)) - generator = torch.Generator(device=device).manual_seed(42) - output = pipe(raw_audio=raw_audio, generator=generator, start_step=5, steps=10) - image = output.images[0] - - assert ( - image.height == self.dummy_vqvae_and_unet[0].config.sample_size[0] - and image.width == self.dummy_vqvae_and_unet[0].config.sample_size[1] - ) - image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10] - expected_slice = np.array([120, 117, 110, 109, 138, 167, 138, 148, 132, 121]) - - assert np.abs(image_slice.flatten() - expected_slice).max() == 0 - - dummy_unet_condition = self.dummy_unet_condition - pipe = AudioDiffusionPipeline( - vqvae=self.dummy_vqvae_and_unet[0], unet=dummy_unet_condition, mel=mel, scheduler=scheduler - ) - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - np.random.seed(0) - encoding = torch.rand((1, 1, 10)) - output = pipe(generator=generator, encoding=encoding) - image = output.images[0] - image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10] - expected_slice = np.array([107, 103, 120, 127, 142, 122, 113, 122, 97, 111]) - - assert np.abs(image_slice.flatten() - expected_slice).max() == 0 - - -@nightly -@require_torch_gpu -class PipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_audio_diffusion(self): - device = torch_device - - pipe = DiffusionPipeline.from_pretrained("teticio/audio-diffusion-ddim-256") - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - generator = torch.Generator(device=device).manual_seed(42) - output = pipe(generator=generator) - audio = output.audios[0] - image = output.images[0] - - assert audio.shape == (1, (pipe.unet.config.sample_size[1] - 1) * pipe.mel.hop_length) - assert image.height == pipe.unet.config.sample_size[0] and image.width == pipe.unet.config.sample_size[1] - image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10] - expected_slice = np.array([151, 167, 154, 144, 122, 134, 121, 105, 70, 26]) - - assert np.abs(image_slice.flatten() - expected_slice).max() == 0 diff --git a/tests/pipelines/latent_diffusion/test_latent_diffusion_uncond.py b/tests/pipelines/latent_diffusion/test_latent_diffusion_uncond.py deleted file mode 100644 index 4d284a494fba..000000000000 --- a/tests/pipelines/latent_diffusion/test_latent_diffusion_uncond.py +++ /dev/null @@ -1,116 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel - -from diffusers import DDIMScheduler, LDMPipeline, UNet2DModel, VQModel -from diffusers.utils.testing_utils import enable_full_determinism, nightly, require_torch, torch_device - - -enable_full_determinism() - - -class LDMPipelineFastTests(unittest.TestCase): - @property - def dummy_uncond_unet(self): - torch.manual_seed(0) - model = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=3, - out_channels=3, - down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), - ) - return model - - @property - def dummy_vq_model(self): - torch.manual_seed(0) - model = VQModel( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=3, - ) - return model - - @property - def dummy_text_encoder(self): - torch.manual_seed(0) - config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - return CLIPTextModel(config) - - def test_inference_uncond(self): - unet = self.dummy_uncond_unet - scheduler = DDIMScheduler() - vae = self.dummy_vq_model - - ldm = LDMPipeline(unet=unet, vqvae=vae, scheduler=scheduler) - ldm.to(torch_device) - ldm.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - image = ldm(generator=generator, num_inference_steps=2, output_type="numpy").images - - generator = torch.manual_seed(0) - image_from_tuple = ldm(generator=generator, num_inference_steps=2, output_type="numpy", return_dict=False)[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.8512, 0.818, 0.6411, 0.6808, 0.4465, 0.5618, 0.46, 0.6231, 0.5172]) - tolerance = 1e-2 if torch_device != "mps" else 3e-2 - - assert np.abs(image_slice.flatten() - expected_slice).max() < tolerance - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < tolerance - - -@nightly -@require_torch -class LDMPipelineIntegrationTests(unittest.TestCase): - def test_inference_uncond(self): - ldm = LDMPipeline.from_pretrained("CompVis/ldm-celebahq-256") - ldm.to(torch_device) - ldm.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - image = ldm(generator=generator, num_inference_steps=5, output_type="numpy").images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 256, 256, 3) - expected_slice = np.array([0.4399, 0.44975, 0.46825, 0.474, 0.4359, 0.4581, 0.45095, 0.4341, 0.4447]) - tolerance = 1e-2 if torch_device != "mps" else 3e-2 - - assert np.abs(image_slice.flatten() - expected_slice).max() < tolerance diff --git a/tests/pipelines/repaint/__init__.py b/tests/pipelines/repaint/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/pipelines/repaint/test_repaint.py b/tests/pipelines/repaint/test_repaint.py deleted file mode 100644 index 607827854bf7..000000000000 --- a/tests/pipelines/repaint/test_repaint.py +++ /dev/null @@ -1,169 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import unittest - -import numpy as np -import torch - -from diffusers import RePaintPipeline, RePaintScheduler, UNet2DModel -from diffusers.utils.testing_utils import ( - enable_full_determinism, - load_image, - load_numpy, - nightly, - require_torch_gpu, - skip_mps, - torch_device, -) - -from ..pipeline_params import IMAGE_INPAINTING_BATCH_PARAMS, IMAGE_INPAINTING_PARAMS -from ..test_pipelines_common import PipelineTesterMixin - - -enable_full_determinism() - - -class RepaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = RePaintPipeline - params = IMAGE_INPAINTING_PARAMS - {"width", "height", "guidance_scale"} - required_optional_params = PipelineTesterMixin.required_optional_params - { - "latents", - "num_images_per_prompt", - "callback", - "callback_steps", - } - batch_params = IMAGE_INPAINTING_BATCH_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - torch.manual_seed(0) - unet = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=3, - out_channels=3, - down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), - ) - scheduler = RePaintScheduler() - components = {"unet": unet, "scheduler": scheduler} - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - image = np.random.RandomState(seed).standard_normal((1, 3, 32, 32)) - image = torch.from_numpy(image).to(device=device, dtype=torch.float32) - mask = (image > 0).to(device=device, dtype=torch.float32) - inputs = { - "image": image, - "mask_image": mask, - "generator": generator, - "num_inference_steps": 5, - "eta": 0.0, - "jump_length": 2, - "jump_n_sample": 2, - "output_type": "numpy", - } - return inputs - - def test_repaint(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = RePaintPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([1.0000, 0.5426, 0.5497, 0.2200, 1.0000, 1.0000, 0.5623, 1.0000, 0.6274]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - @skip_mps - def test_save_load_local(self): - return super().test_save_load_local() - - # RePaint can hardly be made deterministic since the scheduler is currently always - # nondeterministic - @unittest.skip("non-deterministic pipeline") - def test_inference_batch_single_identical(self): - return super().test_inference_batch_single_identical() - - @skip_mps - def test_dict_tuple_outputs_equivalent(self): - return super().test_dict_tuple_outputs_equivalent() - - @skip_mps - def 
test_save_load_optional_components(self): - return super().test_save_load_optional_components() - - @skip_mps - def test_attention_slicing_forward_pass(self): - return super().test_attention_slicing_forward_pass() - - -@nightly -@require_torch_gpu -class RepaintPipelineNightlyTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_celebahq(self): - original_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/" - "repaint/celeba_hq_256.png" - ) - mask_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/repaint/mask_256.png" - ) - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/" - "repaint/celeba_hq_256_result.npy" - ) - - model_id = "google/ddpm-ema-celebahq-256" - unet = UNet2DModel.from_pretrained(model_id) - scheduler = RePaintScheduler.from_pretrained(model_id) - - repaint = RePaintPipeline(unet=unet, scheduler=scheduler).to(torch_device) - repaint.set_progress_bar_config(disable=None) - repaint.enable_attention_slicing() - - generator = torch.manual_seed(0) - output = repaint( - original_image, - mask_image, - num_inference_steps=250, - eta=0.0, - jump_length=10, - jump_n_sample=10, - generator=generator, - output_type="np", - ) - image = output.images[0] - - assert image.shape == (256, 256, 3) - assert np.abs(expected_image - image).mean() < 1e-2 diff --git a/tests/pipelines/score_sde_ve/__init__.py b/tests/pipelines/score_sde_ve/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/pipelines/score_sde_ve/test_score_sde_ve.py b/tests/pipelines/score_sde_ve/test_score_sde_ve.py deleted file mode 100644 index fd8c77b6e41f..000000000000 --- a/tests/pipelines/score_sde_ve/test_score_sde_ve.py +++ /dev/null @@ -1,91 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np -import torch - -from diffusers import ScoreSdeVePipeline, ScoreSdeVeScheduler, UNet2DModel -from diffusers.utils.testing_utils import enable_full_determinism, nightly, require_torch, torch_device - - -enable_full_determinism() - - -class ScoreSdeVeipelineFastTests(unittest.TestCase): - @property - def dummy_uncond_unet(self): - torch.manual_seed(0) - model = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=3, - out_channels=3, - down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), - ) - return model - - def test_inference(self): - unet = self.dummy_uncond_unet - scheduler = ScoreSdeVeScheduler() - - sde_ve = ScoreSdeVePipeline(unet=unet, scheduler=scheduler) - sde_ve.to(torch_device) - sde_ve.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - image = sde_ve(num_inference_steps=2, output_type="numpy", generator=generator).images - - generator = torch.manual_seed(0) - image_from_tuple = sde_ve(num_inference_steps=2, output_type="numpy", generator=generator, return_dict=False)[ - 0 - ] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - -@nightly -@require_torch -class ScoreSdeVePipelineIntegrationTests(unittest.TestCase): - def test_inference(self): - model_id = "google/ncsnpp-church-256" - model = UNet2DModel.from_pretrained(model_id) - - scheduler = ScoreSdeVeScheduler.from_pretrained(model_id) - - sde_ve = ScoreSdeVePipeline(unet=model, scheduler=scheduler) - sde_ve.to(torch_device) - sde_ve.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - image = sde_ve(num_inference_steps=10, output_type="numpy", generator=generator).images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 256, 256, 3) - - expected_slice = np.array([0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/tests/pipelines/spectrogram_diffusion/__init__.py b/tests/pipelines/spectrogram_diffusion/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py deleted file mode 100644 index 1d00c7e963bb..000000000000 --- a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py +++ /dev/null @@ -1,246 +0,0 @@ -# coding=utf-8 -# Copyright 2022 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc -import unittest - -import numpy as np -import torch - -from diffusers import DDPMScheduler, MidiProcessor, SpectrogramDiffusionPipeline -from diffusers.pipelines.spectrogram_diffusion import SpectrogramContEncoder, SpectrogramNotesEncoder, T5FilmDecoder -from diffusers.utils.testing_utils import ( - enable_full_determinism, - nightly, - require_note_seq, - require_onnxruntime, - require_torch_gpu, - skip_mps, - torch_device, -) - -from ..pipeline_params import TOKENS_TO_AUDIO_GENERATION_BATCH_PARAMS, TOKENS_TO_AUDIO_GENERATION_PARAMS -from ..test_pipelines_common import PipelineTesterMixin - - -enable_full_determinism() - - -MIDI_FILE = "./tests/fixtures/elise_format0.mid" - - -# The note-seq package throws an error on import because the default installed version of Ipython -# is not compatible with python 3.8 which we run in the CI. -# https://github.com/huggingface/diffusers/actions/runs/4830121056/jobs/8605954838#step:7:98 -@unittest.skip("The note-seq package currently throws an error on import") -class SpectrogramDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = SpectrogramDiffusionPipeline - required_optional_params = PipelineTesterMixin.required_optional_params - { - "callback", - "latents", - "callback_steps", - "output_type", - "num_images_per_prompt", - } - test_attention_slicing = False - - batch_params = TOKENS_TO_AUDIO_GENERATION_PARAMS - params = TOKENS_TO_AUDIO_GENERATION_BATCH_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - notes_encoder = SpectrogramNotesEncoder( - max_length=2048, - vocab_size=1536, - d_model=768, - dropout_rate=0.1, - num_layers=1, - num_heads=1, - d_kv=4, - d_ff=2048, - feed_forward_proj="gated-gelu", - ) - - continuous_encoder = SpectrogramContEncoder( - input_dims=128, - targets_context_length=256, - d_model=768, - dropout_rate=0.1, - num_layers=1, - num_heads=1, - d_kv=4, - d_ff=2048, - feed_forward_proj="gated-gelu", - ) - - decoder = T5FilmDecoder( - input_dims=128, - targets_length=256, - max_decoder_noise_time=20000.0, - d_model=768, - num_layers=1, - num_heads=1, - d_kv=4, - d_ff=2048, - dropout_rate=0.1, - ) - - scheduler = DDPMScheduler() - - components = { - "notes_encoder": notes_encoder.eval(), - "continuous_encoder": continuous_encoder.eval(), - "decoder": decoder.eval(), - "scheduler": scheduler, - "melgan": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "input_tokens": [ - [1134, 90, 1135, 1133, 1080, 112, 1132, 1080, 1133, 1079, 133, 1132, 1079, 1133, 1] + [0] * 2033 - ], - "generator": generator, - "num_inference_steps": 4, - "output_type": "mel", - } - return inputs - - def test_spectrogram_diffusion(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - pipe = SpectrogramDiffusionPipeline(**components) - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = pipe(**inputs) - mel = output.audios - - mel_slice = mel[0, -3:, -3:] - - assert mel_slice.shape == (3, 3) - expected_slice = np.array( - [-11.512925, -4.788215, -0.46172905, -2.051715, -10.539147, -10.970963, -9.091634, 4.0, 4.0] - ) - assert np.abs(mel_slice.flatten() - expected_slice).max() < 1e-2 - - @skip_mps - def test_save_load_local(self): - return 
super().test_save_load_local() - - @skip_mps - def test_dict_tuple_outputs_equivalent(self): - return super().test_dict_tuple_outputs_equivalent() - - @skip_mps - def test_save_load_optional_components(self): - return super().test_save_load_optional_components() - - @skip_mps - def test_attention_slicing_forward_pass(self): - return super().test_attention_slicing_forward_pass() - - def test_inference_batch_single_identical(self): - pass - - def test_inference_batch_consistent(self): - pass - - @skip_mps - def test_progress_bar(self): - return super().test_progress_bar() - - -@nightly -@require_torch_gpu -@require_onnxruntime -@require_note_seq -class PipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_callback(self): - # TODO - test that pipeline can decode tokens in a callback - # so that music can be played live - device = torch_device - - pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion") - melgan = pipe.melgan - pipe.melgan = None - - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - def callback(step, mel_output): - # decode mel to audio - audio = melgan(input_features=mel_output.astype(np.float32))[0] - assert len(audio[0]) == 81920 * (step + 1) - # simulate that audio is played - return audio - - processor = MidiProcessor() - input_tokens = processor(MIDI_FILE) - - input_tokens = input_tokens[:3] - generator = torch.manual_seed(0) - pipe(input_tokens, num_inference_steps=5, generator=generator, callback=callback, output_type="mel") - - def test_spectrogram_fast(self): - device = torch_device - - pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion") - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - processor = MidiProcessor() - - input_tokens = processor(MIDI_FILE) - # just run two denoising loops - input_tokens = input_tokens[:2] - - generator = torch.manual_seed(0) - output = pipe(input_tokens, num_inference_steps=2, generator=generator) - - audio = output.audios[0] - - assert abs(np.abs(audio).sum() - 3612.841) < 1e-1 - - def test_spectrogram(self): - device = torch_device - - pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion") - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - processor = MidiProcessor() - - input_tokens = processor(MIDI_FILE) - - # just run 4 denoising loops - input_tokens = input_tokens[:4] - - generator = torch.manual_seed(0) - output = pipe(input_tokens, num_inference_steps=100, generator=generator) - - audio = output.audios[0] - assert abs(np.abs(audio).sum() - 9389.1111) < 5e-2 diff --git a/tests/pipelines/versatile_diffusion/__init__.py b/tests/pipelines/versatile_diffusion/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/pipelines/versatile_diffusion/test_versatile_diffusion_dual_guided.py b/tests/pipelines/versatile_diffusion/test_versatile_diffusion_dual_guided.py deleted file mode 100644 index bb8584192ff0..000000000000 --- a/tests/pipelines/versatile_diffusion/test_versatile_diffusion_dual_guided.py +++ /dev/null @@ -1,107 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import tempfile -import unittest - -import numpy as np -import torch - -from diffusers import VersatileDiffusionDualGuidedPipeline -from diffusers.utils.testing_utils import load_image, nightly, require_torch_gpu, torch_device - - -torch.backends.cuda.matmul.allow_tf32 = False - - -@nightly -@require_torch_gpu -class VersatileDiffusionDualGuidedPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_remove_unused_weights_save_load(self): - pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained("shi-labs/versatile-diffusion") - # remove text_unet - pipe.remove_unused_weights() - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - second_prompt = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/versatile_diffusion/benz.jpg" - ) - - generator = torch.manual_seed(0) - image = pipe( - prompt="first prompt", - image=second_prompt, - text_to_image_strength=0.75, - generator=generator, - guidance_scale=7.5, - num_inference_steps=2, - output_type="numpy", - ).images - - with tempfile.TemporaryDirectory() as tmpdirname: - pipe.save_pretrained(tmpdirname) - pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained(tmpdirname) - - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - generator = generator.manual_seed(0) - new_image = pipe( - prompt="first prompt", - image=second_prompt, - text_to_image_strength=0.75, - generator=generator, - guidance_scale=7.5, - num_inference_steps=2, - output_type="numpy", - ).images - - assert np.abs(image - new_image).max() < 1e-5, "Models don't have the same forward pass" - - def test_inference_dual_guided(self): - pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained("shi-labs/versatile-diffusion") - pipe.remove_unused_weights() - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - first_prompt = "cyberpunk 2077" - second_prompt = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/versatile_diffusion/benz.jpg" - ) - generator = torch.manual_seed(0) - image = pipe( - prompt=first_prompt, - image=second_prompt, - text_to_image_strength=0.75, - generator=generator, - guidance_scale=7.5, - num_inference_steps=50, - output_type="numpy", - ).images - - image_slice = image[0, 253:256, 253:256, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.0787, 0.0849, 0.0826, 0.0812, 0.0807, 0.0795, 0.0818, 0.0798, 0.0779]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_variation.py b/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_variation.py deleted file mode 100644 index 1f312a0b71ce..000000000000 --- a/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_variation.py +++ /dev/null @@ -1,57 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -import torch - -from diffusers import VersatileDiffusionImageVariationPipeline -from diffusers.utils.testing_utils import load_image, nightly, require_torch_gpu, torch_device - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class VersatileDiffusionImageVariationPipelineFastTests(unittest.TestCase): - pass - - -@nightly -@require_torch_gpu -class VersatileDiffusionImageVariationPipelineIntegrationTests(unittest.TestCase): - def test_inference_image_variations(self): - pipe = VersatileDiffusionImageVariationPipeline.from_pretrained("shi-labs/versatile-diffusion") - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - image_prompt = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/versatile_diffusion/benz.jpg" - ) - generator = torch.manual_seed(0) - image = pipe( - image=image_prompt, - generator=generator, - guidance_scale=7.5, - num_inference_steps=50, - output_type="numpy", - ).images - - image_slice = image[0, 253:256, 253:256, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.0441, 0.0469, 0.0507, 0.0575, 0.0632, 0.0650, 0.0865, 0.0909, 0.0945]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/tests/pipelines/versatile_diffusion/test_versatile_diffusion_mega.py b/tests/pipelines/versatile_diffusion/test_versatile_diffusion_mega.py deleted file mode 100644 index 585f4f023bc7..000000000000 --- a/tests/pipelines/versatile_diffusion/test_versatile_diffusion_mega.py +++ /dev/null @@ -1,129 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc -import tempfile -import unittest - -import numpy as np -import torch - -from diffusers import VersatileDiffusionPipeline -from diffusers.utils.testing_utils import load_image, nightly, require_torch_gpu, torch_device - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class VersatileDiffusionMegaPipelineFastTests(unittest.TestCase): - pass - - -@nightly -@require_torch_gpu -class VersatileDiffusionMegaPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_from_save_pretrained(self): - pipe = VersatileDiffusionPipeline.from_pretrained("shi-labs/versatile-diffusion", torch_dtype=torch.float16) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - prompt_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/versatile_diffusion/benz.jpg" - ) - - generator = torch.manual_seed(0) - image = pipe.dual_guided( - prompt="first prompt", - image=prompt_image, - text_to_image_strength=0.75, - generator=generator, - guidance_scale=7.5, - num_inference_steps=2, - output_type="numpy", - ).images - - with tempfile.TemporaryDirectory() as tmpdirname: - pipe.save_pretrained(tmpdirname) - pipe = VersatileDiffusionPipeline.from_pretrained(tmpdirname, torch_dtype=torch.float16) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - generator = generator.manual_seed(0) - new_image = pipe.dual_guided( - prompt="first prompt", - image=prompt_image, - text_to_image_strength=0.75, - generator=generator, - guidance_scale=7.5, - num_inference_steps=2, - output_type="numpy", - ).images - - assert np.abs(image - new_image).max() < 1e-5, "Models don't have the same forward pass" - - def test_inference_dual_guided_then_text_to_image(self): - pipe = VersatileDiffusionPipeline.from_pretrained("shi-labs/versatile-diffusion", torch_dtype=torch.float16) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - prompt = "cyberpunk 2077" - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/versatile_diffusion/benz.jpg" - ) - generator = torch.manual_seed(0) - image = pipe.dual_guided( - prompt=prompt, - image=init_image, - text_to_image_strength=0.75, - generator=generator, - guidance_scale=7.5, - num_inference_steps=50, - output_type="numpy", - ).images - - image_slice = image[0, 253:256, 253:256, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.1448, 0.1619, 0.1741, 0.1086, 0.1147, 0.1128, 0.1199, 0.1165, 0.1001]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1 - - prompt = "A painting of a squirrel eating a burger " - generator = torch.manual_seed(0) - image = pipe.text_to_image( - prompt=prompt, generator=generator, guidance_scale=7.5, num_inference_steps=50, output_type="numpy" - ).images - - image_slice = image[0, 253:256, 253:256, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.3367, 0.3169, 0.2656, 0.3870, 0.4790, 0.3796, 0.4009, 0.4878, 0.4778]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1 - - image = pipe.image_variation(init_image, generator=generator, output_type="numpy").images - - image_slice = image[0, 253:256, 253:256, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.3076, 0.3123, 0.3284, 0.3782, 0.3770, 0.3894, 0.4297, 0.4331, 0.4456]) - - assert np.abs(image_slice.flatten() - 
expected_slice).max() < 1e-1 diff --git a/tests/pipelines/versatile_diffusion/test_versatile_diffusion_text_to_image.py b/tests/pipelines/versatile_diffusion/test_versatile_diffusion_text_to_image.py deleted file mode 100644 index e17770778418..000000000000 --- a/tests/pipelines/versatile_diffusion/test_versatile_diffusion_text_to_image.py +++ /dev/null @@ -1,87 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import tempfile -import unittest - -import numpy as np -import torch - -from diffusers import VersatileDiffusionTextToImagePipeline -from diffusers.utils.testing_utils import nightly, require_torch_gpu, torch_device - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class VersatileDiffusionTextToImagePipelineFastTests(unittest.TestCase): - pass - - -@nightly -@require_torch_gpu -class VersatileDiffusionTextToImagePipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_remove_unused_weights_save_load(self): - pipe = VersatileDiffusionTextToImagePipeline.from_pretrained("shi-labs/versatile-diffusion") - # remove text_unet - pipe.remove_unused_weights() - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger " - generator = torch.manual_seed(0) - image = pipe( - prompt=prompt, generator=generator, guidance_scale=7.5, num_inference_steps=2, output_type="numpy" - ).images - - with tempfile.TemporaryDirectory() as tmpdirname: - pipe.save_pretrained(tmpdirname) - pipe = VersatileDiffusionTextToImagePipeline.from_pretrained(tmpdirname) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - generator = generator.manual_seed(0) - new_image = pipe( - prompt=prompt, generator=generator, guidance_scale=7.5, num_inference_steps=2, output_type="numpy" - ).images - - assert np.abs(image - new_image).max() < 1e-5, "Models don't have the same forward pass" - - def test_inference_text2img(self): - pipe = VersatileDiffusionTextToImagePipeline.from_pretrained( - "shi-labs/versatile-diffusion", torch_dtype=torch.float16 - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger " - generator = torch.manual_seed(0) - image = pipe( - prompt=prompt, generator=generator, guidance_scale=7.5, num_inference_steps=50, output_type="numpy" - ).images - - image_slice = image[0, 253:256, 253:256, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.3367, 0.3169, 0.2656, 0.3870, 0.4790, 0.3796, 0.4009, 0.4878, 0.4778]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/tests/pipelines/vq_diffusion/__init__.py b/tests/pipelines/vq_diffusion/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/pipelines/vq_diffusion/test_vq_diffusion.py 
b/tests/pipelines/vq_diffusion/test_vq_diffusion.py deleted file mode 100644 index 88e9f19df709..000000000000 --- a/tests/pipelines/vq_diffusion/test_vq_diffusion.py +++ /dev/null @@ -1,227 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import Transformer2DModel, VQDiffusionPipeline, VQDiffusionScheduler, VQModel -from diffusers.pipelines.vq_diffusion.pipeline_vq_diffusion import LearnedClassifierFreeSamplingEmbeddings -from diffusers.utils.testing_utils import load_numpy, nightly, require_torch_gpu, torch_device - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class VQDiffusionPipelineFastTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - @property - def num_embed(self): - return 12 - - @property - def num_embeds_ada_norm(self): - return 12 - - @property - def text_embedder_hidden_size(self): - return 32 - - @property - def dummy_vqvae(self): - torch.manual_seed(0) - model = VQModel( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=3, - num_vq_embeddings=self.num_embed, - vq_embed_dim=3, - ) - return model - - @property - def dummy_tokenizer(self): - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - return tokenizer - - @property - def dummy_text_encoder(self): - torch.manual_seed(0) - config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=self.text_embedder_hidden_size, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - return CLIPTextModel(config) - - @property - def dummy_transformer(self): - torch.manual_seed(0) - - height = 12 - width = 12 - - model_kwargs = { - "attention_bias": True, - "cross_attention_dim": 32, - "attention_head_dim": height * width, - "num_attention_heads": 1, - "num_vector_embeds": self.num_embed, - "num_embeds_ada_norm": self.num_embeds_ada_norm, - "norm_num_groups": 32, - "sample_size": width, - "activation_fn": "geglu-approximate", - } - - model = Transformer2DModel(**model_kwargs) - return model - - def test_vq_diffusion(self): - device = "cpu" - - vqvae = self.dummy_vqvae - text_encoder = self.dummy_text_encoder - tokenizer = self.dummy_tokenizer - transformer = self.dummy_transformer - scheduler = VQDiffusionScheduler(self.num_embed) - learned_classifier_free_sampling_embeddings = LearnedClassifierFreeSamplingEmbeddings(learnable=False) - - pipe = VQDiffusionPipeline( - vqvae=vqvae, - text_encoder=text_encoder, - tokenizer=tokenizer, - transformer=transformer, - scheduler=scheduler, - 
learned_classifier_free_sampling_embeddings=learned_classifier_free_sampling_embeddings, - ) - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - prompt = "teddy bear playing in the pool" - - generator = torch.Generator(device=device).manual_seed(0) - output = pipe([prompt], generator=generator, num_inference_steps=2, output_type="np") - image = output.images - - generator = torch.Generator(device=device).manual_seed(0) - image_from_tuple = pipe( - [prompt], generator=generator, output_type="np", return_dict=False, num_inference_steps=2 - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 24, 24, 3) - - expected_slice = np.array([0.6551, 0.6168, 0.5008, 0.5676, 0.5659, 0.4295, 0.6073, 0.5599, 0.4992]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - def test_vq_diffusion_classifier_free_sampling(self): - device = "cpu" - - vqvae = self.dummy_vqvae - text_encoder = self.dummy_text_encoder - tokenizer = self.dummy_tokenizer - transformer = self.dummy_transformer - scheduler = VQDiffusionScheduler(self.num_embed) - learned_classifier_free_sampling_embeddings = LearnedClassifierFreeSamplingEmbeddings( - learnable=True, hidden_size=self.text_embedder_hidden_size, length=tokenizer.model_max_length - ) - - pipe = VQDiffusionPipeline( - vqvae=vqvae, - text_encoder=text_encoder, - tokenizer=tokenizer, - transformer=transformer, - scheduler=scheduler, - learned_classifier_free_sampling_embeddings=learned_classifier_free_sampling_embeddings, - ) - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - prompt = "teddy bear playing in the pool" - - generator = torch.Generator(device=device).manual_seed(0) - output = pipe([prompt], generator=generator, num_inference_steps=2, output_type="np") - image = output.images - - generator = torch.Generator(device=device).manual_seed(0) - image_from_tuple = pipe( - [prompt], generator=generator, output_type="np", return_dict=False, num_inference_steps=2 - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 24, 24, 3) - - expected_slice = np.array([0.6693, 0.6075, 0.4959, 0.5701, 0.5583, 0.4333, 0.6171, 0.5684, 0.4988]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 2.0 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - -@nightly -@require_torch_gpu -class VQDiffusionPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_vq_diffusion_classifier_free_sampling(self): - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/vq_diffusion/teddy_bear_pool_classifier_free_sampling.npy" - ) - - pipeline = VQDiffusionPipeline.from_pretrained("microsoft/vq-diffusion-ithq") - pipeline = pipeline.to(torch_device) - pipeline.set_progress_bar_config(disable=None) - - # requires GPU generator for gumbel softmax - # don't use GPU generator in tests though - generator = torch.Generator(device=torch_device).manual_seed(0) - output = pipeline( - "teddy bear playing in the pool", - num_images_per_prompt=1, - generator=generator, - output_type="np", - ) - - image = output.images[0] - - assert image.shape == (256, 256, 3) - assert 
np.abs(expected_image - image).max() < 2.0 From 4e770568858fc7f0a504d98b6569efd85ec06dec Mon Sep 17 00:00:00 2001 From: TilmannR Date: Mon, 18 Dec 2023 19:08:29 +0100 Subject: [PATCH 25/30] Update README.md (#6191) Typo: The script for LoRA training is `train_text_to_image_lora_prior.py` not `train_text_to_image_prior_lora.py`. Alternatively you could rename the file and keep the README.md unchanged. --- examples/wuerstchen/text_to_image/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/wuerstchen/text_to_image/README.md b/examples/wuerstchen/text_to_image/README.md index 8b2040b4ca7f..d655259088e4 100644 --- a/examples/wuerstchen/text_to_image/README.md +++ b/examples/wuerstchen/text_to_image/README.md @@ -77,7 +77,7 @@ First, you need to set up your development environment as explained in the [inst ```bash export DATASET_NAME="lambdalabs/pokemon-blip-captions" -accelerate launch train_text_to_image_prior_lora.py \ +accelerate launch train_text_to_image_lora_prior.py \ --mixed_precision="fp16" \ --dataset_name=$DATASET_NAME --caption_column="text" \ --resolution=768 \ From 67b3d3267e407ca6d9d5c41dc37f0f7d3ae29116 Mon Sep 17 00:00:00 2001 From: Aryan V S Date: Mon, 18 Dec 2023 23:49:11 +0530 Subject: [PATCH 26/30] Support img2img and inpaint in lpw-xl (#6114) * add img2img and inpaint support to lpw-xl * update community README --------- Co-authored-by: Sayak Paul --- examples/community/README.md | 46 +- examples/community/lpw_stable_diffusion_xl.py | 782 +++++++++++++++++- 2 files changed, 785 insertions(+), 43 deletions(-) diff --git a/examples/community/README.md b/examples/community/README.md index c8865adf78f7..7af6d1d7eb02 100755 --- a/examples/community/README.md +++ b/examples/community/README.md @@ -41,7 +41,7 @@ If a community doesn't work as expected, please open an issue and ping the autho | TensorRT Stable Diffusion Inpainting Pipeline | Accelerates the Stable Diffusion Inpainting Pipeline using TensorRT | [TensorRT Stable Diffusion Inpainting Pipeline](#tensorrt-inpainting-stable-diffusion-pipeline) | - | [Asfiya Baig](https://github.com/asfiyab-nvidia) | | IADB Pipeline | Implementation of [Iterative α-(de)Blending: a Minimalist Deterministic Diffusion Model](https://arxiv.org/abs/2305.03486) | [IADB Pipeline](#iadb-pipeline) | - | [Thomas Chambon](https://github.com/tchambon) | Zero1to3 Pipeline | Implementation of [Zero-1-to-3: Zero-shot One Image to 3D Object](https://arxiv.org/abs/2303.11328) | [Zero1to3 Pipeline](#Zero1to3-pipeline) | - | [Xin Kong](https://github.com/kxhit) | -Stable Diffusion XL Long Weighted Prompt Pipeline | A pipeline support unlimited length of prompt and negative prompt, use A1111 style of prompt weighting | [Stable Diffusion XL Long Weighted Prompt Pipeline](#stable-diffusion-xl-long-weighted-prompt-pipeline) | - | [Andrew Zhu](https://xhinker.medium.com/) | +| Stable Diffusion XL Long Weighted Prompt Pipeline | A pipeline support unlimited length of prompt and negative prompt, use A1111 style of prompt weighting | [Stable Diffusion XL Long Weighted Prompt Pipeline](#stable-diffusion-xl-long-weighted-prompt-pipeline) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1LsqilswLR40XLLcp6XFOl5nKb_wOe26W?usp=sharing) | [Andrew Zhu](https://xhinker.medium.com/) | FABRIC - Stable Diffusion with feedback Pipeline | pipeline supports feedback from liked and disliked images | [Stable Diffusion Fabric Pipeline](#stable-diffusion-fabric-pipeline) | - | 
[Shauray Singh](https://shauray8.github.io/about_shauray/) | sketch inpaint - Inpainting with non-inpaint Stable Diffusion | sketch inpaint much like in automatic1111 | [Masked Im2Im Stable Diffusion Pipeline](#stable-diffusion-masked-im2im) | - | [Anatoly Belikov](https://github.com/noskill) | prompt-to-prompt | change parts of a prompt and retain image structure (see [paper page](https://prompt-to-prompt.github.io/)) | [Prompt2Prompt Pipeline](#prompt2prompt-pipeline) | - | [Umer H. Adil](https://twitter.com/UmerHAdil) | @@ -1619,10 +1619,11 @@ This approach is using (optional) CoCa model to avoid writing image description. This SDXL pipeline support unlimited length prompt and negative prompt, compatible with A1111 prompt weighted style. -You can provide both `prompt` and `prompt_2`. if only one prompt is provided, `prompt_2` will be a copy of the provided `prompt`. Here is a sample code to use this pipeline. +You can provide both `prompt` and `prompt_2`. If only one prompt is provided, `prompt_2` will be a copy of the provided `prompt`. Here is a sample code to use this pipeline. ```python from diffusers import DiffusionPipeline +from diffusers.utils import load_image import torch pipe = DiffusionPipeline.from_pretrained( @@ -1633,25 +1634,52 @@ pipe = DiffusionPipeline.from_pretrained( , custom_pipeline = "lpw_stable_diffusion_xl", ) -prompt = "photo of a cute (white) cat running on the grass"*20 -prompt2 = "chasing (birds:1.5)"*20 +prompt = "photo of a cute (white) cat running on the grass" * 20 +prompt2 = "chasing (birds:1.5)" * 20 prompt = f"{prompt},{prompt2}" neg_prompt = "blur, low quality, carton, animate" pipe.to("cuda") -images = pipe( - prompt = prompt - , negative_prompt = neg_prompt -).images[0] + +# text2img +t2i_images = pipe( + prompt=prompt, + negative_prompt=neg_prompt, +).images # alternatively, you can call the .text2img() function + +# img2img +input_image = load_image("/path/to/local/image.png") # or URL to your input image +i2i_images = pipe.img2img( + prompt=prompt, + negative_prompt=neg_prompt, + image=input_image, + strength=0.8, # higher strength will result in more variation compared to original image +).images + +# inpaint +input_mask = load_image("/path/to/local/mask.png") # or URL to your input inpainting mask +inpaint_images = pipe.inpaint( + prompt="photo of a cute (black) cat running on the grass" * 20, + negative_prompt=neg_prompt, + image=input_image, + mask=input_mask, + strength=0.6, # higher strength will result in more variation compared to original image +).images pipe.to("cpu") torch.cuda.empty_cache() -images + +from IPython.display import display # assuming you are using this code in a notebook +display(t2i_images[0]) +display(i2i_images[0]) +display(inpaint_images[0]) ``` In the above code, the `prompt2` is appended to the `prompt`, which is more than 77 tokens. "birds" are showing up in the result. ![Stable Diffusion XL Long Weighted Prompt Pipeline sample](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl_long_weighted_prompt.png) +For more results, checkout [PR #6114](https://github.com/huggingface/diffusers/pull/6114). 
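The `strength` argument used by `pipe.img2img()` and `pipe.inpaint()` above controls how much of the requested `num_inference_steps` schedule is actually executed. Below is a minimal standalone sketch of that step-count math, mirroring the `get_timesteps` logic this patch adds to the pipeline; the helper name is purely illustrative, and it assumes a first-order scheduler with no `denoising_start` override.

```python
# Rough illustration of how `strength` trims the denoising schedule for
# img2img / inpaint calls (sketch only, not the pipeline implementation).
def effective_steps(num_inference_steps: int, strength: float) -> int:
    # How far along the schedule the input image gets re-noised.
    init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
    # The earliest (highest-noise) steps are skipped entirely.
    t_start = max(num_inference_steps - init_timestep, 0)
    return num_inference_steps - t_start


print(effective_steps(50, 0.8))  # 40 denoising steps actually run
print(effective_steps(50, 1.0))  # 50 steps: the input image is fully re-noised
```

With the example above (`strength=0.8` and the default 50 steps), roughly 40 denoising steps run; `strength=1.0` re-noises the input completely and effectively behaves like text-to-image.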
+ ## Example Images Mixing (with CoCa) ```python import requests diff --git a/examples/community/lpw_stable_diffusion_xl.py b/examples/community/lpw_stable_diffusion_xl.py index dfe60d9794e1..e913048f7ae4 100644 --- a/examples/community/lpw_stable_diffusion_xl.py +++ b/examples/community/lpw_stable_diffusion_xl.py @@ -11,10 +11,11 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union import torch +from PIL import Image from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer from diffusers import DiffusionPipeline, StableDiffusionXLPipeline -from diffusers.image_processor import VaeImageProcessor +from diffusers.image_processor import PipelineImageInput, VaeImageProcessor from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from diffusers.models import AutoencoderKL, UNet2DConditionModel from diffusers.models.attention_processor import ( @@ -23,7 +24,7 @@ LoRAXFormersAttnProcessor, XFormersAttnProcessor, ) -from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput +from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput from diffusers.schedulers import KarrasDiffusionSchedulers from diffusers.utils import ( is_accelerate_available, @@ -461,6 +462,65 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): return noise_cfg +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents +def retrieve_latents( + encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, + `timesteps` must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default + timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` + must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. 
+ """ + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + class SDXLLongPromptWeightingPipeline(DiffusionPipeline, FromSingleFileMixin, LoraLoaderMixin): r""" Pipeline for text-to-image generation using Stable Diffusion XL. @@ -526,6 +586,9 @@ def __init__( self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.mask_processor = VaeImageProcessor( + vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_binarize=True, do_convert_grayscale=True + ) self.default_sample_size = self.unet.config.sample_size add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available() @@ -813,6 +876,7 @@ def check_inputs( prompt_2, height, width, + strength, callback_steps, negative_prompt=None, negative_prompt_2=None, @@ -824,6 +888,9 @@ def check_inputs( if height % 8 != 0 or width % 8 != 0: raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + if strength < 0 or strength > 1: + raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") + if (callback_steps is None) or ( callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) ): @@ -880,23 +947,263 @@ def check_inputs( "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`." ) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): - shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." + def get_timesteps(self, num_inference_steps, strength, device, denoising_start=None): + # get the original timestep using init_timestep + if denoising_start is None: + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + t_start = max(num_inference_steps - init_timestep, 0) + else: + t_start = 0 + + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + + # Strength is irrelevant if we directly request a timestep to start at; + # that is, strength is determined by the denoising_start instead. 
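+        # e.g. with denoising_start=0.8 and num_train_timesteps=1000 the cutoff computed
+        # below is 200, so only the timesteps covering the final 20% of the denoising
+        # trajectory are kept.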
+ if denoising_start is not None: + discrete_timestep_cutoff = int( + round( + self.scheduler.config.num_train_timesteps + - (denoising_start * self.scheduler.config.num_train_timesteps) + ) ) - if latents is None: - latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + num_inference_steps = (timesteps < discrete_timestep_cutoff).sum().item() + if self.scheduler.order == 2 and num_inference_steps % 2 == 0: + # if the scheduler is a 2nd order scheduler we might have to do +1 + # because `num_inference_steps` might be even given that every timestep + # (except the highest one) is duplicated. If `num_inference_steps` is even it would + # mean that we cut the timesteps in the middle of the denoising step + # (between 1st and 2nd devirative) which leads to incorrect results. By adding 1 + # we ensure that the denoising process always ends after the 2nd derivate step of the scheduler + num_inference_steps = num_inference_steps + 1 + + # because t_n+1 >= t_n, we slice the timesteps starting from the end + timesteps = timesteps[-num_inference_steps:] + return timesteps, num_inference_steps + + return timesteps, num_inference_steps - t_start + + def prepare_latents( + self, + image, + mask, + width, + height, + num_channels_latents, + timestep, + batch_size, + num_images_per_prompt, + dtype, + device, + generator=None, + add_noise=True, + latents=None, + is_strength_max=True, + return_noise=False, + return_image_latents=False, + ): + batch_size *= num_images_per_prompt + + if image is None: + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + elif mask is None: + if not isinstance(image, (torch.Tensor, Image.Image, list)): + raise ValueError( + f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}" + ) + + # Offload text encoder if `enable_model_cpu_offload` was enabled + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.text_encoder_2.to("cpu") + torch.cuda.empty_cache() + + image = image.to(device=device, dtype=dtype) + + if image.shape[1] == 4: + init_latents = image + + else: + # make sure the VAE is in float32 mode, as it overflows in float16 + if self.vae.config.force_upcast: + image = image.float() + self.vae.to(dtype=torch.float32) + + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + elif isinstance(generator, list): + init_latents = [ + retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) + for i in range(batch_size) + ] + init_latents = torch.cat(init_latents, dim=0) + else: + init_latents = retrieve_latents(self.vae.encode(image), generator=generator) + + if self.vae.config.force_upcast: + self.vae.to(dtype) + + init_latents = init_latents.to(dtype) + init_latents = self.vae.config.scaling_factor * init_latents + + if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: + # expand init_latents for batch_size + additional_image_per_prompt = batch_size // init_latents.shape[0] + init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0) + elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." + ) + else: + init_latents = torch.cat([init_latents], dim=0) + + if add_noise: + shape = init_latents.shape + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + # get latents + init_latents = self.scheduler.add_noise(init_latents, noise, timestep) + + latents = init_latents + return latents + else: - latents = latents.to(device) + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if (image is None or timestep is None) and not is_strength_max: + raise ValueError( + "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise." + "However, either the image or the noise timestep has not been provided." + ) + + if image.shape[1] == 4: + image_latents = image.to(device=device, dtype=dtype) + image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1) + elif return_image_latents or (latents is None and not is_strength_max): + image = image.to(device=device, dtype=dtype) + image_latents = self._encode_vae_image(image=image, generator=generator) + image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1) + + if latents is None and add_noise: + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + # if strength is 1. 
then initialise the latents to noise, else initial to image + noise + latents = noise if is_strength_max else self.scheduler.add_noise(image_latents, noise, timestep) + # if pure noise then scale the initial latents by the Scheduler's init sigma + latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents + elif add_noise: + noise = latents.to(device) + latents = noise * self.scheduler.init_noise_sigma + else: + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + latents = image_latents.to(device) + + outputs = (latents,) + + if return_noise: + outputs += (noise,) + + if return_image_latents: + outputs += (image_latents,) + + return outputs + + def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator): + dtype = image.dtype + if self.vae.config.force_upcast: + image = image.float() + self.vae.to(dtype=torch.float32) + + if isinstance(generator, list): + image_latents = [ + retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) + for i in range(image.shape[0]) + ] + image_latents = torch.cat(image_latents, dim=0) + else: + image_latents = retrieve_latents(self.vae.encode(image), generator=generator) + + if self.vae.config.force_upcast: + self.vae.to(dtype) + + image_latents = image_latents.to(dtype) + image_latents = self.vae.config.scaling_factor * image_latents + + return image_latents + + def prepare_mask_latents( + self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance + ): + # resize the mask to latents shape as we concatenate the mask to the latents + # we do that before converting to dtype to avoid breaking in case we're using cpu_offload + # and half precision + mask = torch.nn.functional.interpolate( + mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor) + ) + mask = mask.to(device=device, dtype=dtype) + + # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method + if mask.shape[0] < batch_size: + if not batch_size % mask.shape[0] == 0: + raise ValueError( + "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to" + f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number" + " of masks that you pass is divisible by the total requested batch size." + ) + mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1) + + mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask + + if masked_image is not None and masked_image.shape[1] == 4: + masked_image_latents = masked_image + else: + masked_image_latents = None + + if masked_image is not None: + if masked_image_latents is None: + masked_image = masked_image.to(device=device, dtype=dtype) + masked_image_latents = self._encode_vae_image(masked_image, generator=generator) + + if masked_image_latents.shape[0] < batch_size: + if not batch_size % masked_image_latents.shape[0] == 0: + raise ValueError( + "The passed images and the required batch size don't match. Images are supposed to be duplicated" + f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed." + " Make sure the number of images that you pass is divisible by the total requested batch size." 
+ ) + masked_image_latents = masked_image_latents.repeat( + batch_size // masked_image_latents.shape[0], 1, 1, 1 + ) + + masked_image_latents = ( + torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents + ) + + # aligning device to prevent device errors when concating it with the latent model input + masked_image_latents = masked_image_latents.to(device=device, dtype=dtype) - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents + return mask, masked_image_latents def _get_add_time_ids(self, original_size, crops_coords_top_left, target_size, dtype): add_time_ids = list(original_size + crops_coords_top_left + target_size) @@ -934,15 +1241,52 @@ def upcast_vae(self): self.vae.decoder.conv_in.to(dtype) self.vae.decoder.mid_block.to(dtype) + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def guidance_rescale(self): + return self._guidance_rescale + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def denoising_end(self): + return self._denoising_end + + @property + def denoising_start(self): + return self._denoising_start + + @property + def num_timesteps(self): + return self._num_timesteps + @torch.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, prompt: str = None, prompt_2: Optional[str] = None, + image: Optional[PipelineImageInput] = None, + mask_image: Optional[PipelineImageInput] = None, + masked_image_latents: Optional[torch.FloatTensor] = None, height: Optional[int] = None, width: Optional[int] = None, + strength: float = 0.8, num_inference_steps: int = 50, + timesteps: List[int] = None, + denoising_start: Optional[float] = None, denoising_end: Optional[float] = None, guidance_scale: float = 5.0, negative_prompt: Optional[str] = None, @@ -975,20 +1319,46 @@ def __call__( prompt_2 (`str`): The prompt to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is used in both text-encoders + image (`PipelineImageInput`, *optional*): + `Image`, or tensor representing an image batch, that will be used as the starting point for the + process. + mask_image (`PipelineImageInput`, *optional*): + `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be + replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a + PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should + contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The width in pixels of the generated image. + strength (`float`, *optional*, defaults to 0.8): + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. 
+ `image` will be used as a starting point, adding more noise to it the larger the `strength`. The + number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added + noise will be maximum and the denoising process will run for the full number of iterations specified in + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + denoising_start (`float`, *optional*): + When specified, indicates the fraction (between 0.0 and 1.0) of the total denoising process to be + bypassed before it is initiated. Consequently, the initial part of the denoising process is skipped and + it is assumed that the passed `image` is a partly denoised image. Note that when this is specified, + strength will be ignored. The `denoising_start` parameter is particularly beneficial when this pipeline + is integrated into a "Mixture of Denoisers" multi-pipeline setup, as detailed in [**Refine Image + Quality**](https://huggingface.co/docs/diffusers/using-diffusers/sdxl#refine-image-quality). denoising_end (`float`, *optional*): When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be completed before it is intentionally prematurely terminated. As a result, the returned sample will - still retain a substantial amount of noise as determined by the discrete timesteps selected by the - scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a - "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image - Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output) + still retain a substantial amount of noise (ca. final 20% of timesteps still needed) and should be + denoised by a successor pipeline that has `denoising_start` set to 0.8 so that it only denoises the + final 20% of the scheduler. The denoising_end parameter should ideally be utilized when this pipeline + forms a part of a "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refine Image + Quality**](https://huggingface.co/docs/diffusers/using-diffusers/sdxl#refine-image-quality). guidance_scale (`float`, *optional*, defaults to 5.0): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of [Imagen @@ -1084,6 +1454,7 @@ def __call__( prompt_2, height, width, + strength, callback_steps, negative_prompt, negative_prompt_2, @@ -1093,6 +1464,12 @@ def __call__( negative_pooled_prompt_embeds, ) + self._guidance_scale = guidance_scale + self._guidance_rescale = guidance_rescale + self._cross_attention_kwargs = cross_attention_kwargs + self._denoising_end = denoising_end + self._denoising_start = denoising_start + # 2. 
Define call parameters if prompt is not None and isinstance(prompt, str): batch_size = 1 @@ -1121,28 +1498,126 @@ def __call__( ) = get_weighted_text_embeddings_sdxl( pipe=self, prompt=prompt, neg_prompt=negative_prompt, num_images_per_prompt=num_images_per_prompt ) + dtype = prompt_embeds.dtype + + if isinstance(image, Image.Image): + image = self.image_processor.preprocess(image, height=height, width=width) + if image is not None: + image = image.to(device=self.device, dtype=dtype) + + if isinstance(mask_image, Image.Image): + mask = self.mask_processor.preprocess(mask_image, height=height, width=width) + else: + mask = mask_image + if mask_image is not None: + mask = mask.to(device=self.device, dtype=dtype) + + if masked_image_latents is not None: + masked_image = masked_image_latents + elif image.shape[1] == 4: + # if image is in latent space, we can't mask it + masked_image = None + else: + masked_image = image * (mask < 0.5) + else: + mask = None # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps, device=device) + def denoising_value_valid(dnv): + return isinstance(self.denoising_end, float) and 0 < dnv < 1 + + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + if image is not None: + timesteps, num_inference_steps = self.get_timesteps( + num_inference_steps, + strength, + device, + denoising_start=self.denoising_start if denoising_value_valid else None, + ) + + # check that number of inference steps is not < 1 - as this doesn't make sense + if num_inference_steps < 1: + raise ValueError( + f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline" + f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline." + ) - timesteps = self.scheduler.timesteps + latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) + is_strength_max = strength == 1.0 + add_noise = True if self.denoising_start is None else False # 5. Prepare latent variables - num_channels_latents = self.unet.config.in_channels + num_channels_latents = self.vae.config.latent_channels + num_channels_unet = self.unet.config.in_channels + return_image_latents = num_channels_unet == 4 + latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - prompt_embeds.dtype, - device, - generator, - latents, + image=image, + mask=mask, + width=width, + height=height, + num_channels_latents=num_channels_unet, + timestep=latent_timestep, + batch_size=batch_size, + num_images_per_prompt=num_images_per_prompt, + dtype=prompt_embeds.dtype, + device=device, + generator=generator, + add_noise=add_noise, + latents=latents, + is_strength_max=is_strength_max, + return_noise=True, + return_image_latents=return_image_latents, ) + if mask is not None: + if return_image_latents: + latents, noise, image_latents = latents + else: + latents, noise = latents + + # 5.1. Prepare mask latent variables + if mask is not None: + mask, masked_image_latents = self.prepare_mask_latents( + mask=mask, + masked_image=masked_image, + batch_size=batch_size * num_images_per_prompt, + height=height, + width=width, + dtype=prompt_embeds.dtype, + device=device, + generator=generator, + do_classifier_free_guidance=self.do_classifier_free_guidance, + ) + + # 8. 
Check that sizes of mask, masked image and latents match + if num_channels_unet == 9: + # default case for runwayml/stable-diffusion-inpainting + num_channels_mask = mask.shape[1] + num_channels_masked_image = masked_image_latents.shape[1] + if num_channels_latents + num_channels_mask + num_channels_masked_image != num_channels_unet: + raise ValueError( + f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects" + f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +" + f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}" + f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of" + " `pipeline.unet` or your `mask_image` or `image` input." + ) + elif num_channels_unet != 4: + raise ValueError( + f"The unet {self.unet.__class__} should have either 4 or 9 input channels, not {self.unet.config.in_channels}." + ) + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + height, width = latents.shape[-2:] + height = height * self.vae_scale_factor + width = width * self.vae_scale_factor + + original_size = original_size or (height, width) + target_size = target_size or (height, width) + # 7. Prepare added time ids & embeddings add_text_embeds = pooled_prompt_embeds add_time_ids = self._get_add_time_ids( @@ -1158,20 +1633,41 @@ def __call__( add_text_embeds = add_text_embeds.to(device) add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1) - # 8. Denoising loop num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) # 7.1 Apply denoising_end - if denoising_end is not None and isinstance(denoising_end, float) and denoising_end > 0 and denoising_end < 1: + if ( + self.denoising_end is not None + and self.denoising_start is not None + and denoising_value_valid(self.denoising_end) + and denoising_value_valid(self.denoising_start) + and self.denoising_start >= self.denoising_end + ): + raise ValueError( + f"`denoising_start`: {self.denoising_start} cannot be larger than or equal to `denoising_end`: " + + f" {self.denoising_end} when using type float." + ) + elif self.denoising_end is not None and denoising_value_valid(self.denoising_end): discrete_timestep_cutoff = int( round( self.scheduler.config.num_train_timesteps - - (denoising_end * self.scheduler.config.num_train_timesteps) + - (self.denoising_end * self.scheduler.config.num_train_timesteps) ) ) num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps))) timesteps = timesteps[:num_inference_steps] + # 8. Optionally get Guidance Scale Embedding + timestep_cond = None + if self.unet.config.time_cond_proj_dim is not None: + guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt) + timestep_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim + ).to(device=device, dtype=latents.dtype) + + self._num_timesteps = len(timesteps) + + # 9. 
Denoising loop with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance @@ -1179,13 +1675,17 @@ def __call__( latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + if mask is not None and num_channels_unet == 9: + latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1) + # predict the noise residual added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, + timestep_cond=timestep_cond, + cross_attention_kwargs=self.cross_attention_kwargs, added_cond_kwargs=added_cond_kwargs, return_dict=False, )[0] @@ -1202,6 +1702,22 @@ def __call__( # compute the previous noisy sample x_t -> x_t-1 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + if mask is not None and num_channels_unet == 4: + init_latents_proper = image_latents + + if self.do_classifier_free_guidance: + init_mask, _ = mask.chunk(2) + else: + init_mask = mask + + if i < len(timesteps) - 1: + noise_timestep = timesteps[i + 1] + init_latents_proper = self.scheduler.add_noise( + init_latents_proper, noise, torch.tensor([noise_timestep]) + ) + + latents = (1 - init_mask) * init_latents_proper + init_mask * latents + # call the callback, if provided if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() @@ -1241,6 +1757,204 @@ def __call__( return StableDiffusionXLPipelineOutput(images=image) + def text2img( + self, + prompt: str = None, + prompt_2: Optional[str] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + timesteps: List[int] = None, + denoising_start: Optional[float] = None, + denoising_end: Optional[float] = None, + guidance_scale: float = 5.0, + negative_prompt: Optional[str] = None, + negative_prompt_2: Optional[str] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guidance_rescale: float = 0.0, + original_size: Optional[Tuple[int, int]] = None, + crops_coords_top_left: Tuple[int, int] = (0, 0), + target_size: Optional[Tuple[int, int]] = None, + ): + return self.__call__( + prompt=prompt, + prompt_2=prompt_2, + height=height, + width=width, + num_inference_steps=num_inference_steps, + timesteps=timesteps, + denoising_start=denoising_start, + denoising_end=denoising_end, + guidance_scale=guidance_scale, + negative_prompt=negative_prompt, + negative_prompt_2=negative_prompt_2, + num_images_per_prompt=num_images_per_prompt, + eta=eta, + generator=generator, + latents=latents, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + 
negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, + output_type=output_type, + return_dict=return_dict, + callback=callback, + callback_steps=callback_steps, + cross_attention_kwargs=cross_attention_kwargs, + guidance_rescale=guidance_rescale, + original_size=original_size, + crops_coords_top_left=crops_coords_top_left, + target_size=target_size, + ) + + def img2img( + self, + prompt: str = None, + prompt_2: Optional[str] = None, + image: Optional[PipelineImageInput] = None, + height: Optional[int] = None, + width: Optional[int] = None, + strength: float = 0.8, + num_inference_steps: int = 50, + timesteps: List[int] = None, + denoising_start: Optional[float] = None, + denoising_end: Optional[float] = None, + guidance_scale: float = 5.0, + negative_prompt: Optional[str] = None, + negative_prompt_2: Optional[str] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guidance_rescale: float = 0.0, + original_size: Optional[Tuple[int, int]] = None, + crops_coords_top_left: Tuple[int, int] = (0, 0), + target_size: Optional[Tuple[int, int]] = None, + ): + return self.__call__( + prompt=prompt, + prompt_2=prompt_2, + image=image, + height=height, + width=width, + strength=strength, + num_inference_steps=num_inference_steps, + timesteps=timesteps, + denoising_start=denoising_start, + denoising_end=denoising_end, + guidance_scale=guidance_scale, + negative_prompt=negative_prompt, + negative_prompt_2=negative_prompt_2, + num_images_per_prompt=num_images_per_prompt, + eta=eta, + generator=generator, + latents=latents, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, + output_type=output_type, + return_dict=return_dict, + callback=callback, + callback_steps=callback_steps, + cross_attention_kwargs=cross_attention_kwargs, + guidance_rescale=guidance_rescale, + original_size=original_size, + crops_coords_top_left=crops_coords_top_left, + target_size=target_size, + ) + + def inpaint( + self, + prompt: str = None, + prompt_2: Optional[str] = None, + image: Optional[PipelineImageInput] = None, + mask_image: Optional[PipelineImageInput] = None, + masked_image_latents: Optional[torch.FloatTensor] = None, + height: Optional[int] = None, + width: Optional[int] = None, + strength: float = 0.8, + num_inference_steps: int = 50, + timesteps: List[int] = None, + denoising_start: Optional[float] = None, + denoising_end: Optional[float] = None, + guidance_scale: float = 5.0, + negative_prompt: Optional[str] = None, + negative_prompt_2: Optional[str] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + 
pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guidance_rescale: float = 0.0, + original_size: Optional[Tuple[int, int]] = None, + crops_coords_top_left: Tuple[int, int] = (0, 0), + target_size: Optional[Tuple[int, int]] = None, + ): + return self.__call__( + prompt=prompt, + prompt_2=prompt_2, + image=image, + mask_image=mask_image, + masked_image_latents=masked_image_latents, + height=height, + width=width, + strength=strength, + num_inference_steps=num_inference_steps, + timesteps=timesteps, + denoising_start=denoising_start, + denoising_end=denoising_end, + guidance_scale=guidance_scale, + negative_prompt=negative_prompt, + negative_prompt_2=negative_prompt_2, + num_images_per_prompt=num_images_per_prompt, + eta=eta, + generator=generator, + latents=latents, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, + output_type=output_type, + return_dict=return_dict, + callback=callback, + callback_steps=callback_steps, + cross_attention_kwargs=cross_attention_kwargs, + guidance_rescale=guidance_rescale, + original_size=original_size, + crops_coords_top_left=crops_coords_top_left, + target_size=target_size, + ) + # Overrride to properly handle the loading and unloading of the additional text encoder. def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], **kwargs): # We could have accessed the unet config from `lora_state_dict()` too. We pass From 7d0a47f387e7c76ffa4fee5e7365228cef25801d Mon Sep 17 00:00:00 2001 From: Haofan Wang Date: Tue, 19 Dec 2023 02:33:05 +0800 Subject: [PATCH 27/30] Update train_text_to_image_lora.py (#6144) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update train_text_to_image_lora.py * Fix typo? --------- Co-authored-by: M. 
Tolga Cangöz <46008593+standardAI@users.noreply.github.com> Co-authored-by: Sayak Paul --- examples/text_to_image/train_text_to_image_lora.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/text_to_image/train_text_to_image_lora.py b/examples/text_to_image/train_text_to_image_lora.py index 0af2c1b2a5b4..ca699c863eb6 100644 --- a/examples/text_to_image/train_text_to_image_lora.py +++ b/examples/text_to_image/train_text_to_image_lora.py @@ -799,7 +799,8 @@ def collate_fn(examples): save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") accelerator.save_state(save_path) - unet_lora_state_dict = get_peft_model_state_dict(unet) + unwrapped_unet = accelerator.unwrap_model(unet) + unet_lora_state_dict = get_peft_model_state_dict(unwrapped_unet) StableDiffusionPipeline.save_lora_weights( save_directory=save_path, @@ -864,7 +865,8 @@ def collate_fn(examples): if accelerator.is_main_process: unet = unet.to(torch.float32) - unet_lora_state_dict = get_peft_model_state_dict(unet) + unwrapped_unet = accelerator.unwrap_model(unet) + unet_lora_state_dict = get_peft_model_state_dict(unwrapped_unet) StableDiffusionPipeline.save_lora_weights( save_directory=args.output_dir, unet_lora_layers=unet_lora_state_dict, From fa3c86beaf04e297d4b0e824692e3bd4edfb5f22 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 18 Dec 2023 19:33:24 +0100 Subject: [PATCH 28/30] [SVD] Fix guidance scale (#6002) * [SVD] Fix guidance scale * make style --- .../pipeline_stable_video_diffusion.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py index 988623ca653e..8b4c7bdd0881 100644 --- a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +++ b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py @@ -291,7 +291,9 @@ def guidance_scale(self): # corresponds to doing no classifier free guidance. @property def do_classifier_free_guidance(self): - return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None + if isinstance(self.guidance_scale, (int, float)): + return self.guidance_scale + return self.guidance_scale.max() > 1 @property def num_timesteps(self): @@ -416,10 +418,10 @@ def __call__( # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = max_guidance_scale > 1.0 + self._guidance_scale = max_guidance_scale # 3. Encode input image - image_embeddings = self._encode_image(image, device, num_videos_per_prompt, do_classifier_free_guidance) + image_embeddings = self._encode_image(image, device, num_videos_per_prompt, self.do_classifier_free_guidance) # NOTE: Stable Diffusion Video was conditioned on fps - 1, which # is why it is reduced here. 
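The property change in the hunk above is what makes classifier-free guidance detection work for SVD, where the guidance scale is ramped across frames and can therefore be a tensor rather than a single float. A standalone sketch of that dispatch follows; `needs_cfg` is a hypothetical helper name, and the scalar branch is written as an explicit comparison for clarity.

```python
import torch


def needs_cfg(guidance_scale) -> bool:
    # Plain scalar guidance: CFG is worthwhile once the scale exceeds 1.
    if isinstance(guidance_scale, (int, float)):
        return guidance_scale > 1
    # Per-frame guidance schedule (a tensor): CFG is needed if any frame's
    # scale exceeds 1, hence the max() reduction used in the patch.
    return bool(guidance_scale.max() > 1)


print(needs_cfg(1.0))                           # False
print(needs_cfg(torch.linspace(1.0, 3.0, 25)))  # True
```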
@@ -435,7 +437,7 @@ def __call__( if needs_upcasting: self.vae.to(dtype=torch.float32) - image_latents = self._encode_vae_image(image, device, num_videos_per_prompt, do_classifier_free_guidance) + image_latents = self._encode_vae_image(image, device, num_videos_per_prompt, self.do_classifier_free_guidance) image_latents = image_latents.to(image_embeddings.dtype) # cast back to fp16 if needed @@ -454,7 +456,7 @@ def __call__( image_embeddings.dtype, batch_size, num_videos_per_prompt, - do_classifier_free_guidance, + self.do_classifier_free_guidance, ) added_time_ids = added_time_ids.to(device) @@ -490,7 +492,7 @@ def __call__( with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # Concatenate image_latents over channels dimention @@ -506,7 +508,7 @@ def __call__( )[0] # perform guidance - if do_classifier_free_guidance: + if self.do_classifier_free_guidance: noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2) noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond) From 781775ea56160a6dea3d53fd5005d0d7fca5f10a Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Tue, 19 Dec 2023 00:45:51 +0530 Subject: [PATCH 29/30] Slow Test for Pipelines minor fixes (#6221) update --- .../shap_e/pipeline_shap_e_img2img.py | 6 ++-- .../pipelines/unclip/pipeline_unclip.py | 3 +- .../unclip/pipeline_unclip_image_variation.py | 1 + .../pipelines/animatediff/test_animatediff.py | 31 ++++++++++++++++++- .../test_stable_diffusion_adapter.py | 3 +- .../test_stable_diffusion_xl_adapter.py | 10 +++--- 6 files changed, 41 insertions(+), 13 deletions(-) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py index 046d20f47100..2a7c1ea6f57d 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py @@ -283,6 +283,9 @@ def __call__( f"Only the output types `pil`, `np`, `latent` and `mesh` are supported not output_type={output_type}" ) + # Offload all models + self.maybe_free_model_hooks() + if output_type == "latent": return ShapEPipelineOutput(images=latents) @@ -312,9 +315,6 @@ def __call__( if output_type == "pil": images = [self.numpy_to_pil(image) for image in images] - # Offload all models - self.maybe_free_model_hooks() - if not return_dict: return (images,) diff --git a/src/diffusers/pipelines/unclip/pipeline_unclip.py b/src/diffusers/pipelines/unclip/pipeline_unclip.py index 7bebed73c106..ebfb11d8e55a 100644 --- a/src/diffusers/pipelines/unclip/pipeline_unclip.py +++ b/src/diffusers/pipelines/unclip/pipeline_unclip.py @@ -477,8 +477,9 @@ def __call__( image = super_res_latents # done super res - # post processing + self.maybe_free_model_hooks() + # post processing image = image * 0.5 + 0.5 image = image.clamp(0, 1) image = image.cpu().permute(0, 2, 3, 1).float().numpy() diff --git a/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py b/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py index bdc4280041a2..bf6c5e4fa018 100644 --- a/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +++ 
b/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py @@ -403,6 +403,7 @@ def __call__( image = super_res_latents # done super res + self.maybe_free_model_hooks() # post processing diff --git a/tests/pipelines/animatediff/test_animatediff.py b/tests/pipelines/animatediff/test_animatediff.py index 5cd0a45c7406..633ed9fc233e 100644 --- a/tests/pipelines/animatediff/test_animatediff.py +++ b/tests/pipelines/animatediff/test_animatediff.py @@ -14,7 +14,7 @@ UNet2DConditionModel, UNetMotionModel, ) -from diffusers.utils import logging +from diffusers.utils import is_xformers_available, logging from diffusers.utils.testing_utils import numpy_cosine_similarity_distance, require_torch_gpu, slow, torch_device from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS @@ -233,6 +233,35 @@ def test_prompt_embeds(self): inputs["prompt_embeds"] = torch.randn((1, 4, 32), device=torch_device) pipe(**inputs) + @unittest.skipIf( + torch_device != "cuda" or not is_xformers_available(), + reason="XFormers attention is only available with CUDA and `xformers` installed", + ) + def test_xformers_attention_forwardGenerator_pass(self): + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + for component in pipe.components.values(): + if hasattr(component, "set_default_attn_processor"): + component.set_default_attn_processor() + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(torch_device) + output_without_offload = pipe(**inputs).frames[0] + output_without_offload = ( + output_without_offload.cpu() if torch.is_tensor(output_without_offload) else output_without_offload + ) + + pipe.enable_xformers_memory_efficient_attention() + inputs = self.get_dummy_inputs(torch_device) + output_with_offload = pipe(**inputs).frames[0] + output_with_offload = ( + output_with_offload.cpu() if torch.is_tensor(output_with_offload) else output_without_offload + ) + + max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max() + self.assertLess(max_diff, 1e-4, "XFormers attention should not affect the inference results") + @slow @require_torch_gpu diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_adapter.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_adapter.py index a5e8649f060f..f1b61c3364f0 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_adapter.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_adapter.py @@ -804,8 +804,7 @@ def test_stable_diffusion_adapter_zoedepth_sd_v15(self): pipe = StableDiffusionAdapterPipeline.from_pretrained(sd_model, adapter=adapter, safety_checker=None) pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - + pipe.enable_model_cpu_offload() generator = torch.Generator(device="cpu").manual_seed(0) out = pipe(prompt=prompt, image=image, generator=generator, num_inference_steps=2, output_type="np").images diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py index f63ee8be1dd0..d1920d59b447 100644 --- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py +++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py @@ -681,7 +681,7 @@ def test_canny_lora(self): variant="fp16", ) pipe.load_lora_weights("CiroN2022/toy-face", weight_name="toy_face_sdxl.safetensors") - pipe.enable_sequential_cpu_offload() + 
pipe.enable_model_cpu_offload() pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) @@ -694,8 +694,6 @@ def test_canny_lora(self): assert images[0].shape == (768, 512, 3) - original_image = images[0, -3:, -3:, -1].flatten() - expected_image = np.array( - [0.50346327, 0.50708383, 0.50719553, 0.5135172, 0.5155377, 0.5066059, 0.49680984, 0.5005894, 0.48509413] - ) - assert numpy_cosine_similarity_distance(original_image, expected_image) < 1e-4 + image_slice = images[0, -3:, -3:, -1].flatten() + expected_slice = np.array([0.4284, 0.4337, 0.4319, 0.4255, 0.4329, 0.4280, 0.4338, 0.4420, 0.4226]) + assert numpy_cosine_similarity_distance(image_slice, expected_slice) < 1e-4 From 68e962395cfaf85ddc9c1076fed9fb1824174d3f Mon Sep 17 00:00:00 2001 From: Fabio Rigano Date: Tue, 19 Dec 2023 00:46:43 +0100 Subject: [PATCH 30/30] Add converter method for ip adapters (#6150) * Add converter method for ip adapters * Move converter method * Update to image proj converter --------- Co-authored-by: Sayak Paul --- src/diffusers/loaders/unet.py | 175 +++++++++++++++------------------- 1 file changed, 77 insertions(+), 98 deletions(-) diff --git a/src/diffusers/loaders/unet.py b/src/diffusers/loaders/unet.py index 7309c3fc709c..7dec43571b1c 100644 --- a/src/diffusers/loaders/unet.py +++ b/src/diffusers/loaders/unet.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import os -from collections import OrderedDict, defaultdict +from collections import defaultdict from contextlib import nullcontext from typing import Callable, Dict, List, Optional, Union @@ -664,6 +664,80 @@ def delete_adapters(self, adapter_names: Union[List[str], str]): if hasattr(self, "peft_config"): self.peft_config.pop(adapter_name, None) + def _convert_ip_adapter_image_proj_to_diffusers(self, state_dict): + updated_state_dict = {} + image_projection = None + + if "proj.weight" in state_dict: + # IP-Adapter + num_image_text_embeds = 4 + clip_embeddings_dim = state_dict["proj.weight"].shape[-1] + cross_attention_dim = state_dict["proj.weight"].shape[0] // 4 + + image_projection = ImageProjection( + cross_attention_dim=cross_attention_dim, + image_embed_dim=clip_embeddings_dim, + num_image_text_embeds=num_image_text_embeds, + ) + + for key, value in state_dict.items(): + diffusers_name = key.replace("proj", "image_embeds") + updated_state_dict[diffusers_name] = value + + elif "proj.3.weight" in state_dict: + # IP-Adapter Full + clip_embeddings_dim = state_dict["proj.0.weight"].shape[0] + cross_attention_dim = state_dict["proj.3.weight"].shape[0] + + image_projection = MLPProjection( + cross_attention_dim=cross_attention_dim, image_embed_dim=clip_embeddings_dim + ) + + for key, value in state_dict.items(): + diffusers_name = key.replace("proj.0", "ff.net.0.proj") + diffusers_name = diffusers_name.replace("proj.2", "ff.net.2") + diffusers_name = diffusers_name.replace("proj.3", "norm") + updated_state_dict[diffusers_name] = value + + else: + # IP-Adapter Plus + num_image_text_embeds = state_dict["latents"].shape[1] + embed_dims = state_dict["proj_in.weight"].shape[1] + output_dims = state_dict["proj_out.weight"].shape[0] + hidden_dims = state_dict["latents"].shape[2] + heads = state_dict["layers.0.0.to_q.weight"].shape[0] // 64 + + image_projection = Resampler( + embed_dims=embed_dims, + output_dims=output_dims, + hidden_dims=hidden_dims, + heads=heads, + num_queries=num_image_text_embeds, + ) + + for key, value in 
state_dict.items(): + diffusers_name = key.replace("0.to", "2.to") + diffusers_name = diffusers_name.replace("1.0.weight", "3.0.weight") + diffusers_name = diffusers_name.replace("1.0.bias", "3.0.bias") + diffusers_name = diffusers_name.replace("1.1.weight", "3.1.net.0.proj.weight") + diffusers_name = diffusers_name.replace("1.3.weight", "3.1.net.2.weight") + + if "norm1" in diffusers_name: + updated_state_dict[diffusers_name.replace("0.norm1", "0")] = value + elif "norm2" in diffusers_name: + updated_state_dict[diffusers_name.replace("0.norm2", "1")] = value + elif "to_kv" in diffusers_name: + v_chunk = value.chunk(2, dim=0) + updated_state_dict[diffusers_name.replace("to_kv", "to_k")] = v_chunk[0] + updated_state_dict[diffusers_name.replace("to_kv", "to_v")] = v_chunk[1] + elif "to_out" in diffusers_name: + updated_state_dict[diffusers_name.replace("to_out", "to_out.0")] = value + else: + updated_state_dict[diffusers_name] = value + + image_projection.load_state_dict(updated_state_dict) + return image_projection + def _load_ip_adapter_weights(self, state_dict): from ..models.attention_processor import ( AttnProcessor, @@ -724,103 +798,8 @@ def _load_ip_adapter_weights(self, state_dict): self.set_attn_processor(attn_procs) - # create image projection layers. - if "proj.weight" in state_dict["image_proj"]: - # IP-Adapter - clip_embeddings_dim = state_dict["image_proj"]["proj.weight"].shape[-1] - cross_attention_dim = state_dict["image_proj"]["proj.weight"].shape[0] // 4 - - image_projection = ImageProjection( - cross_attention_dim=cross_attention_dim, - image_embed_dim=clip_embeddings_dim, - num_image_text_embeds=num_image_text_embeds, - ) - image_projection.to(dtype=self.dtype, device=self.device) - - # load image projection layer weights - image_proj_state_dict = {} - image_proj_state_dict.update( - { - "image_embeds.weight": state_dict["image_proj"]["proj.weight"], - "image_embeds.bias": state_dict["image_proj"]["proj.bias"], - "norm.weight": state_dict["image_proj"]["norm.weight"], - "norm.bias": state_dict["image_proj"]["norm.bias"], - } - ) - image_projection.load_state_dict(image_proj_state_dict) - del image_proj_state_dict - - elif "proj.3.weight" in state_dict["image_proj"]: - clip_embeddings_dim = state_dict["image_proj"]["proj.0.weight"].shape[0] - cross_attention_dim = state_dict["image_proj"]["proj.3.weight"].shape[0] - - image_projection = MLPProjection( - cross_attention_dim=cross_attention_dim, image_embed_dim=clip_embeddings_dim - ) - image_projection.to(dtype=self.dtype, device=self.device) - - # load image projection layer weights - image_proj_state_dict = {} - image_proj_state_dict.update( - { - "ff.net.0.proj.weight": state_dict["image_proj"]["proj.0.weight"], - "ff.net.0.proj.bias": state_dict["image_proj"]["proj.0.bias"], - "ff.net.2.weight": state_dict["image_proj"]["proj.2.weight"], - "ff.net.2.bias": state_dict["image_proj"]["proj.2.bias"], - "norm.weight": state_dict["image_proj"]["proj.3.weight"], - "norm.bias": state_dict["image_proj"]["proj.3.bias"], - } - ) - image_projection.load_state_dict(image_proj_state_dict) - del image_proj_state_dict - - else: - # IP-Adapter Plus - embed_dims = state_dict["image_proj"]["proj_in.weight"].shape[1] - output_dims = state_dict["image_proj"]["proj_out.weight"].shape[0] - hidden_dims = state_dict["image_proj"]["latents"].shape[2] - heads = state_dict["image_proj"]["layers.0.0.to_q.weight"].shape[0] // 64 - - image_projection = Resampler( - embed_dims=embed_dims, - output_dims=output_dims, - hidden_dims=hidden_dims, - 
heads=heads, - num_queries=num_image_text_embeds, - ) - - image_proj_state_dict = state_dict["image_proj"] - - new_sd = OrderedDict() - for k, v in image_proj_state_dict.items(): - if "0.to" in k: - k = k.replace("0.to", "2.to") - elif "1.0.weight" in k: - k = k.replace("1.0.weight", "3.0.weight") - elif "1.0.bias" in k: - k = k.replace("1.0.bias", "3.0.bias") - elif "1.1.weight" in k: - k = k.replace("1.1.weight", "3.1.net.0.proj.weight") - elif "1.3.weight" in k: - k = k.replace("1.3.weight", "3.1.net.2.weight") - - if "norm1" in k: - new_sd[k.replace("0.norm1", "0")] = v - elif "norm2" in k: - new_sd[k.replace("0.norm2", "1")] = v - elif "to_kv" in k: - v_chunk = v.chunk(2, dim=0) - new_sd[k.replace("to_kv", "to_k")] = v_chunk[0] - new_sd[k.replace("to_kv", "to_v")] = v_chunk[1] - elif "to_out" in k: - new_sd[k.replace("to_out", "to_out.0")] = v - else: - new_sd[k] = v - - image_projection.load_state_dict(new_sd) - del image_proj_state_dict + # convert IP-Adapter Image Projection layers to diffusers + image_projection = self._convert_ip_adapter_image_proj_to_diffusers(state_dict["image_proj"]) self.encoder_hid_proj = image_projection.to(device=self.device, dtype=self.dtype) self.config.encoder_hid_dim_type = "ip_image_proj" - - delete_adapter_layers
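
To make the final refactor easier to follow, here is a small, self-contained sketch of the key-renaming idea behind `_convert_ip_adapter_image_proj_to_diffusers` for the plain IP-Adapter branch: the original checkpoint stores the projection under `proj.*`, while the diffusers `ImageProjection` module expects `image_embeds.*`. The helper name and tensor shapes below are illustrative assumptions, not part of the library's API:

import torch

def rename_plain_ip_adapter_keys(image_proj_state_dict):
    # "proj.weight"/"proj.bias" become "image_embeds.weight"/"image_embeds.bias";
    # the "norm.*" keys contain no "proj" substring and pass through unchanged.
    return {k.replace("proj", "image_embeds"): v for k, v in image_proj_state_dict.items()}

# Placeholder shapes: the projection maps the CLIP image embedding to
# num_image_text_embeds (4) tokens of the UNet's cross-attention dimension.
original = {
    "proj.weight": torch.randn(4 * 768, 1024),
    "proj.bias": torch.randn(4 * 768),
    "norm.weight": torch.ones(768),
    "norm.bias": torch.zeros(768),
}
converted = rename_plain_ip_adapter_keys(original)
print(sorted(converted))
# ['image_embeds.bias', 'image_embeds.weight', 'norm.bias', 'norm.weight']
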