diff --git a/.github/workflows/nightly_tests.yml b/.github/workflows/nightly_tests.yml
index 2f73c66de829..2e9ac33d6b00 100644
--- a/.github/workflows/nightly_tests.yml
+++ b/.github/workflows/nightly_tests.yml
@@ -19,7 +19,7 @@ env:
jobs:
setup_torch_cuda_pipeline_matrix:
name: Setup Torch Pipelines Matrix
- runs-on: ubuntu-latest
+ runs-on: diffusers/diffusers-pytorch-cpu
outputs:
pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }}
steps:
@@ -67,19 +67,19 @@ jobs:
fetch-depth: 2
- name: NVIDIA-SMI
run: nvidia-smi
-
+
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
python -m uv pip install pytest-reportlog
-
+
- name: Environment
run: |
python utils/print_env.py
-
- - name: Nightly PyTorch CUDA checkpoint (pipelines) tests
+
+ - name: Nightly PyTorch CUDA checkpoint (pipelines) tests
env:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
@@ -88,9 +88,9 @@ jobs:
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_pipeline_${{ matrix.module }}_cuda \
- --report-log=tests_pipeline_${{ matrix.module }}_cuda.log \
+ --report-log=tests_pipeline_${{ matrix.module }}_cuda.log \
tests/pipelines/${{ matrix.module }}
-
+
- name: Failure short reports
if: ${{ failure() }}
run: |
@@ -103,7 +103,7 @@ jobs:
with:
name: pipeline_${{ matrix.module }}_test_reports
path: reports
-
+
- name: Generate Report and Notify Channel
if: always()
run: |
@@ -112,7 +112,7 @@ jobs:
run_nightly_tests_for_other_torch_modules:
name: Torch Non-Pipelines CUDA Nightly Tests
- runs-on: docker-gpu
+ runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: diffusers/diffusers-pytorch-cuda
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
@@ -139,7 +139,7 @@ jobs:
run: python utils/print_env.py
- name: Run nightly PyTorch CUDA tests for non-pipeline modules
- if: ${{ matrix.module != 'examples'}}
+ if: ${{ matrix.module != 'examples'}}
env:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
@@ -148,7 +148,7 @@ jobs:
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_torch_${{ matrix.module }}_cuda \
- --report-log=tests_torch_${{ matrix.module }}_cuda.log \
+ --report-log=tests_torch_${{ matrix.module }}_cuda.log \
tests/${{ matrix.module }}
- name: Run nightly example tests with Torch
@@ -161,13 +161,13 @@ jobs:
python -m uv pip install peft@git+https://github.com/huggingface/peft.git
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v --make-reports=examples_torch_cuda \
- --report-log=examples_torch_cuda.log \
+ --report-log=examples_torch_cuda.log \
examples/
- name: Failure short reports
if: ${{ failure() }}
run: |
- cat reports/tests_torch_${{ matrix.module }}_cuda_stats.txt
+ cat reports/tests_torch_${{ matrix.module }}_cuda_stats.txt
cat reports/tests_torch_${{ matrix.module }}_cuda_failures_short.txt
- name: Test suite reports artifacts
@@ -185,7 +185,7 @@ jobs:
run_lora_nightly_tests:
name: Nightly LoRA Tests with PEFT and TORCH
- runs-on: docker-gpu
+ runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: diffusers/diffusers-pytorch-cuda
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
@@ -218,13 +218,13 @@ jobs:
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_torch_lora_cuda \
- --report-log=tests_torch_lora_cuda.log \
+ --report-log=tests_torch_lora_cuda.log \
tests/lora
-
+
- name: Failure short reports
if: ${{ failure() }}
run: |
- cat reports/tests_torch_lora_cuda_stats.txt
+ cat reports/tests_torch_lora_cuda_stats.txt
cat reports/tests_torch_lora_cuda_failures_short.txt
- name: Test suite reports artifacts
@@ -239,12 +239,12 @@ jobs:
run: |
pip install slack_sdk tabulate
python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
-
+
run_flax_tpu_tests:
name: Nightly Flax TPU Tests
runs-on: docker-tpu
if: github.event_name == 'schedule'
-
+
container:
image: diffusers/diffusers-flax-tpu
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --privileged
@@ -274,7 +274,7 @@ jobs:
python -m pytest -n 0 \
-s -v -k "Flax" \
--make-reports=tests_flax_tpu \
- --report-log=tests_flax_tpu.log \
+ --report-log=tests_flax_tpu.log \
tests/
- name: Failure short reports
@@ -298,11 +298,11 @@ jobs:
run_nightly_onnx_tests:
name: Nightly ONNXRuntime CUDA tests on Ubuntu
- runs-on: docker-gpu
+ runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: diffusers/diffusers-onnxruntime-cuda
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
-
+
steps:
- name: Checkout diffusers
uses: actions/checkout@v3
@@ -321,7 +321,7 @@ jobs:
- name: Environment
run: python utils/print_env.py
-
+
- name: Run nightly ONNXRuntime CUDA tests
env:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
@@ -329,7 +329,7 @@ jobs:
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "Onnx" \
--make-reports=tests_onnx_cuda \
- --report-log=tests_onnx_cuda.log \
+ --report-log=tests_onnx_cuda.log \
tests/
- name: Failure short reports
@@ -344,7 +344,7 @@ jobs:
with:
name: ${{ matrix.config.report }}_test_reports
path: reports
-
+
- name: Generate Report and Notify Channel
if: always()
run: |
diff --git a/.github/workflows/pr_test_fetcher.yml b/.github/workflows/pr_test_fetcher.yml
index 4dbb118c6092..7bbaaf2240a2 100644
--- a/.github/workflows/pr_test_fetcher.yml
+++ b/.github/workflows/pr_test_fetcher.yml
@@ -15,7 +15,7 @@ concurrency:
jobs:
setup_pr_tests:
name: Setup PR Tests
- runs-on: docker-cpu
+ runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
container:
image: diffusers/diffusers-pytorch-cpu
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
@@ -73,7 +73,7 @@ jobs:
max-parallel: 2
matrix:
modules: ${{ fromJson(needs.setup_pr_tests.outputs.matrix) }}
- runs-on: docker-cpu
+ runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
container:
image: diffusers/diffusers-pytorch-cpu
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
@@ -123,7 +123,7 @@ jobs:
config:
- name: Hub tests for models, schedulers, and pipelines
framework: hub_tests_pytorch
- runner: docker-cpu
+ runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
image: diffusers/diffusers-pytorch-cpu
report: torch_hub
diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml
index a6cb123a7035..d071af2b0be2 100644
--- a/.github/workflows/push_tests.yml
+++ b/.github/workflows/push_tests.yml
@@ -21,7 +21,9 @@ env:
jobs:
setup_torch_cuda_pipeline_matrix:
name: Setup Torch Pipelines CUDA Slow Tests Matrix
- runs-on: ubuntu-latest
+ runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
+ container:
+ image: diffusers/diffusers-pytorch-cpu
outputs:
pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }}
steps:
@@ -29,14 +31,13 @@ jobs:
uses: actions/checkout@v3
with:
fetch-depth: 2
- - name: Set up Python
- uses: actions/setup-python@v4
- with:
- python-version: "3.8"
- name: Install dependencies
run: |
- pip install -e .
- pip install huggingface_hub
+ python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+ python -m uv pip install -e [quality,test]
+ - name: Environment
+ run: |
+ python utils/print_env.py
- name: Fetch Pipeline Matrix
id: fetch_pipeline_matrix
run: |
@@ -60,7 +61,7 @@ jobs:
runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: diffusers/diffusers-pytorch-cuda
- options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0 --privileged
+ options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0 --privileged
steps:
- name: Checkout diffusers
uses: actions/checkout@v3
@@ -114,10 +115,10 @@ jobs:
torch_cuda_tests:
name: Torch CUDA Tests
- runs-on: docker-gpu
+ runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: diffusers/diffusers-pytorch-cuda
- options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
+ options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0
defaults:
run:
shell: bash
@@ -166,10 +167,10 @@ jobs:
peft_cuda_tests:
name: PEFT CUDA Tests
- runs-on: docker-gpu
+ runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: diffusers/diffusers-pytorch-cuda
- options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
+ options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0
defaults:
run:
shell: bash
@@ -219,7 +220,7 @@ jobs:
runs-on: docker-tpu
container:
image: diffusers/diffusers-flax-tpu
- options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --privileged
+ options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --privileged
defaults:
run:
shell: bash
@@ -263,10 +264,10 @@ jobs:
onnx_cuda_tests:
name: ONNX CUDA Tests
- runs-on: docker-gpu
+ runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: diffusers/diffusers-onnxruntime-cuda
- options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
+ options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --gpus 0
defaults:
run:
shell: bash
@@ -311,11 +312,11 @@ jobs:
run_torch_compile_tests:
name: PyTorch Compile CUDA tests
- runs-on: docker-gpu
+ runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: diffusers/diffusers-pytorch-compile-cuda
- options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
+ options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Checkout diffusers
@@ -352,11 +353,11 @@ jobs:
run_xformers_tests:
name: PyTorch xformers CUDA tests
- runs-on: docker-gpu
+ runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: diffusers/diffusers-pytorch-xformers-cuda
- options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
+ options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Checkout diffusers
@@ -393,11 +394,11 @@ jobs:
run_examples_tests:
name: Examples PyTorch CUDA tests on Ubuntu
- runs-on: docker-gpu
+ runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: diffusers/diffusers-pytorch-cuda
- options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
+ options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Checkout diffusers
diff --git a/.github/workflows/ssh-runner.yml b/.github/workflows/ssh-runner.yml
new file mode 100644
index 000000000000..e5bbdd64f549
--- /dev/null
+++ b/.github/workflows/ssh-runner.yml
@@ -0,0 +1,46 @@
+name: SSH into runners
+
+on:
+ workflow_dispatch:
+ inputs:
+ runner_type:
+ description: 'Type of runner to test (a10 or t4)'
+ required: true
+ docker_image:
+ description: 'Name of the Docker image'
+ required: true
+
+env:
+ IS_GITHUB_CI: "1"
+ HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
+ HF_HOME: /mnt/cache
+ DIFFUSERS_IS_CI: yes
+ OMP_NUM_THREADS: 8
+ MKL_NUM_THREADS: 8
+ RUN_SLOW: yes
+
+jobs:
+ ssh_runner:
+ name: "SSH"
+ runs-on: [single-gpu, nvidia-gpu, "${{ github.event.inputs.runner_type }}", ci]
+ container:
+ image: ${{ github.event.inputs.docker_image }}
+ options: --gpus all --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+
+ steps:
+ - name: Checkout diffusers
+ uses: actions/checkout@v3
+ with:
+ fetch-depth: 2
+
+ - name: NVIDIA-SMI
+ run: |
+ nvidia-smi
+
+ - name: Tailscale # In order to be able to SSH when a test fails
+ uses: huggingface/tailscale-action@v1
+ with:
+ authkey: ${{ secrets.TAILSCALE_SSH_AUTHKEY }}
+ slackChannel: ${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}
+ slackToken: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
+ waitForSSH: true
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 140bdfad3a89..1c21d4cd9f74 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -62,13 +62,11 @@
- local: using-diffusers/callback
title: Pipeline callbacks
- local: using-diffusers/reusing_seeds
- title: Improve image quality with deterministic generation
- - local: using-diffusers/control_brightness
- title: Control image brightness
+ title: Reproducible pipelines
+ - local: using-diffusers/image_quality
+ title: Controlling image quality
- local: using-diffusers/weighted_prompts
title: Prompt techniques
- - local: using-diffusers/freeu
- title: Improve generation quality with FreeU
title: Inference techniques
- sections:
- local: using-diffusers/sdxl
@@ -83,22 +81,14 @@
title: ControlNet
- local: using-diffusers/t2i_adapter
title: T2I-Adapter
+ - local: using-diffusers/inference_with_lcm
+ title: Latent Consistency Model
- local: using-diffusers/textual_inversion_inference
title: Textual inversion
- local: using-diffusers/shap-e
title: Shap-E
- local: using-diffusers/diffedit
title: DiffEdit
- - local: using-diffusers/reproducibility
- title: Create reproducible pipelines
- - local: using-diffusers/custom_pipeline_examples
- title: Community pipelines
- - local: using-diffusers/contribute_pipeline
- title: Contribute a community pipeline
- - local: using-diffusers/inference_with_lcm_lora
- title: Latent Consistency Model-LoRA
- - local: using-diffusers/inference_with_lcm
- title: Latent Consistency Model
- local: using-diffusers/inference_with_tcd_lora
title: Trajectory Consistency Distillation-LoRA
- local: using-diffusers/svd
@@ -149,8 +139,6 @@
- sections:
- local: optimization/fp16
title: Speed up inference
- - local: using-diffusers/distilled_sd
- title: Distilled Stable Diffusion inference
- local: optimization/memory
title: Reduce memory usage
- local: optimization/torch2.0
diff --git a/docs/source/en/api/attnprocessor.md b/docs/source/en/api/attnprocessor.md
index ab89d4d260f0..f586e9b08f2c 100644
--- a/docs/source/en/api/attnprocessor.md
+++ b/docs/source/en/api/attnprocessor.md
@@ -55,3 +55,6 @@ An attention processor is a class for applying different types of attention mech
## XFormersAttnProcessor
[[autodoc]] models.attention_processor.XFormersAttnProcessor
+
+## AttnProcessorNPU
+[[autodoc]] models.attention_processor.AttnProcessorNPU
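+
+A minimal sketch of routing a model's attention layers through this processor; it assumes an Ascend NPU environment with torch_npu installed, otherwise constructing the processor fails:
+
+```py
+from diffusers import UNet2DConditionModel
+from diffusers.models.attention_processor import AttnProcessorNPU
+
+unet = UNet2DConditionModel.from_pretrained(
+    "runwayml/stable-diffusion-v1-5", subfolder="unet"
+)
+# use NPU flash attention in every attention layer of the UNet
+unet.set_attn_processor(AttnProcessorNPU())
+```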
diff --git a/docs/source/en/api/pipelines/animatediff.md b/docs/source/en/api/pipelines/animatediff.md
index 913529e6ebdc..425764541590 100644
--- a/docs/source/en/api/pipelines/animatediff.md
+++ b/docs/source/en/api/pipelines/animatediff.md
@@ -101,6 +101,53 @@ AnimateDiff tends to work better with finetuned Stable Diffusion models. If you
+### AnimateDiffSDXLPipeline
+
+AnimateDiff can also be used with SDXL models. This is currently an experimental feature as only a beta release of the motion adapter checkpoint is available.
+
+```python
+import torch
+from diffusers.models import MotionAdapter
+from diffusers import AnimateDiffSDXLPipeline, DDIMScheduler
+from diffusers.utils import export_to_gif
+
+adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-sdxl-beta", torch_dtype=torch.float16)
+
+model_id = "stabilityai/stable-diffusion-xl-base-1.0"
+scheduler = DDIMScheduler.from_pretrained(
+ model_id,
+ subfolder="scheduler",
+ clip_sample=False,
+ timestep_spacing="linspace",
+ beta_schedule="linear",
+ steps_offset=1,
+)
+pipe = AnimateDiffSDXLPipeline.from_pretrained(
+ model_id,
+ motion_adapter=adapter,
+ scheduler=scheduler,
+ torch_dtype=torch.float16,
+ variant="fp16",
+).to("cuda")
+
+# enable memory savings
+pipe.enable_vae_slicing()
+pipe.enable_vae_tiling()
+
+output = pipe(
+ prompt="a panda surfing in the ocean, realistic, high quality",
+ negative_prompt="low quality, worst quality",
+ num_inference_steps=20,
+ guidance_scale=8,
+ width=1024,
+ height=1024,
+ num_frames=16,
+)
+
+frames = output.frames[0]
+export_to_gif(frames, "animation.gif")
+```
+
### AnimateDiffVideoToVideoPipeline
AnimateDiff can also be used to generate visually similar videos or enable style/character/background or other edits starting from an initial video, allowing you to seamlessly explore creative possibilities.
@@ -522,6 +569,12 @@ export_to_gif(frames, "animatelcm-motion-lora.gif")
- all
- __call__
+## AnimateDiffSDXLPipeline
+
+[[autodoc]] AnimateDiffSDXLPipeline
+ - all
+ - __call__
+
## AnimateDiffVideoToVideoPipeline
[[autodoc]] AnimateDiffVideoToVideoPipeline
diff --git a/docs/source/en/api/pipelines/overview.md b/docs/source/en/api/pipelines/overview.md
index cd1232a90d6e..e7b8bf4936c0 100644
--- a/docs/source/en/api/pipelines/overview.md
+++ b/docs/source/en/api/pipelines/overview.md
@@ -97,6 +97,11 @@ The table below lists all the pipelines currently available in 🤗 Diffusers an
- to
- components
+
+[[autodoc]] pipelines.StableDiffusionMixin.enable_freeu
+
+[[autodoc]] pipelines.StableDiffusionMixin.disable_freeu
+
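+A minimal sketch of toggling FreeU on a Stable Diffusion pipeline; the scaling values below are the ones commonly suggested for Stable Diffusion v1.5 and are only an illustration:
+
+```py
+import torch
+from diffusers import DiffusionPipeline
+
+pipeline = DiffusionPipeline.from_pretrained(
+    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True
+).to("cuda")
+
+# reweight the UNet's backbone and skip connection features
+pipeline.enable_freeu(s1=0.9, s2=0.2, b1=1.2, b2=1.4)
+image = pipeline("an astronaut riding a horse on mars").images[0]
+
+# restore the default UNet behavior
+pipeline.disable_freeu()
+```
+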
## FlaxDiffusionPipeline
[[autodoc]] pipelines.pipeline_flax_utils.FlaxDiffusionPipeline
diff --git a/docs/source/en/api/utilities.md b/docs/source/en/api/utilities.md
index 71253db215ab..d4f4d7d7964f 100644
--- a/docs/source/en/api/utilities.md
+++ b/docs/source/en/api/utilities.md
@@ -37,3 +37,7 @@ Utility and helper functions for working with 🤗 Diffusers.
## make_image_grid
[[autodoc]] utils.make_image_grid
+
+## randn_tensor
+
+[[autodoc]] utils.torch_utils.randn_tensor
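+
+A short sketch of creating reproducible latent noise with `randn_tensor`; the shape below is only an example for a 512x512 Stable Diffusion latent:
+
+```py
+import torch
+from diffusers.utils.torch_utils import randn_tensor
+
+# a seeded generator makes the noise reproducible across runs
+generator = torch.Generator("cpu").manual_seed(0)
+latents = randn_tensor(
+    (1, 4, 64, 64), generator=generator, device=torch.device("cuda"), dtype=torch.float16
+)
+```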
diff --git a/docs/source/en/conceptual/contribution.md b/docs/source/en/conceptual/contribution.md
index 24ac52ba19c9..cc2e0ae07b2c 100644
--- a/docs/source/en/conceptual/contribution.md
+++ b/docs/source/en/conceptual/contribution.md
@@ -198,38 +198,81 @@ Anything displayed on [the official Diffusers doc page](https://huggingface.co/d
Please have a look at [this page](https://github.com/huggingface/diffusers/tree/main/docs) on how to verify changes made to the documentation locally.
-
### 6. Contribute a community pipeline
-[Pipelines](https://huggingface.co/docs/diffusers/api/pipelines/overview) are usually the first point of contact between the Diffusers library and the user.
-Pipelines are examples of how to use Diffusers [models](https://huggingface.co/docs/diffusers/api/models/overview) and [schedulers](https://huggingface.co/docs/diffusers/api/schedulers/overview).
-We support two types of pipelines:
+> [!TIP]
+> Read the [Community pipelines](../using-diffusers/custom_pipeline_overview#community-pipelines) guide to learn more about the difference between a GitHub and Hugging Face Hub community pipeline. If you're interested in why we have community pipelines, take a look at GitHub Issue [#841](https://github.com/huggingface/diffusers/issues/841) (basically, we can't maintain all the possible ways diffusion models can be used for inference but we also don't want to prevent the community from building them).
+
+Contributing a community pipeline is a great way to share your creativity and work with the community. It lets you build on top of the [`DiffusionPipeline`] so that anyone can load and use it by setting the `custom_pipeline` parameter. This section will walk you through how to create a simple pipeline where the UNet only does a single forward pass and calls the scheduler once (a "one-step" pipeline).
+
+1. Create a one_step_unet.py file for your community pipeline. This file can use whatever packages you want as long as they're installed by the user. Make sure you have only one pipeline class that inherits from [`DiffusionPipeline`] to load model weights and the scheduler configuration from the Hub. Add a UNet and scheduler to the `__init__` function.
+
+ You should also add the `register_modules` function to ensure your pipeline and its components can be saved with [`~DiffusionPipeline.save_pretrained`].
+
+```py
+from diffusers import DiffusionPipeline
+import torch
+
+class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
+ def __init__(self, unet, scheduler):
+ super().__init__()
+
+ self.register_modules(unet=unet, scheduler=scheduler)
+```
+
+2. In the forward pass (which we recommend defining as `__call__`), you can add any feature you'd like. For the "one-step" pipeline, create a random image and call the UNet and scheduler once by setting `timestep=1`.
+
+```py
+ from diffusers import DiffusionPipeline
+ import torch
+
+ class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
+ def __init__(self, unet, scheduler):
+ super().__init__()
+
+ self.register_modules(unet=unet, scheduler=scheduler)
-- Official Pipelines
-- Community Pipelines
+ def __call__(self):
+ image = torch.randn(
+ (1, self.unet.config.in_channels, self.unet.config.sample_size, self.unet.config.sample_size),
+ )
+ timestep = 1
+
+ model_output = self.unet(image, timestep).sample
+ scheduler_output = self.scheduler.step(model_output, timestep, image).prev_sample
+
+ return scheduler_output
+```
-Both official and community pipelines follow the same design and consist of the same type of components.
+Now you can run the pipeline by passing a UNet and scheduler to it, or you can load pretrained weights if the pipeline structure is identical.
+
+```py
+from diffusers import DDPMScheduler, UNet2DModel
+
+scheduler = DDPMScheduler()
+unet = UNet2DModel()
+
+pipeline = UnetSchedulerOneForwardPipeline(unet=unet, scheduler=scheduler)
+output = pipeline()
+# load pretrained weights
+pipeline = UnetSchedulerOneForwardPipeline.from_pretrained("google/ddpm-cifar10-32", use_safetensors=True)
+output = pipeline()
+```
-Official pipelines are tested and maintained by the core maintainers of Diffusers. Their code
-resides in [src/diffusers/pipelines](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines).
-In contrast, community pipelines are contributed and maintained purely by the **community** and are **not** tested.
-They reside in [examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) and while they can be accessed via the [PyPI diffusers package](https://pypi.org/project/diffusers/), their code is not part of the PyPI distribution.
+You can either share your pipeline as a GitHub community pipeline or Hub community pipeline.
-The reason for the distinction is that the core maintainers of the Diffusers library cannot maintain and test all
-possible ways diffusion models can be used for inference, but some of them may be of interest to the community.
-Officially released diffusion pipelines,
-such as Stable Diffusion are added to the core src/diffusers/pipelines package which ensures
-high quality of maintenance, no backward-breaking code changes, and testing.
-More bleeding edge pipelines should be added as community pipelines. If usage for a community pipeline is high, the pipeline can be moved to the official pipelines upon request from the community. This is one of the ways we strive to be a community-driven library.
+
+
-To add a community pipeline, one should add a .py file to [examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) and adapt the [examples/community/README.md](https://github.com/huggingface/diffusers/tree/main/examples/community/README.md) to include an example of the new pipeline.
+Share your GitHub pipeline by opening a pull request on the Diffusers [repository](https://github.com/huggingface/diffusers) and adding the one_step_unet.py file to the [examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) subfolder.
-An example can be seen [here](https://github.com/huggingface/diffusers/pull/2400).
+
+
-Community pipeline PRs are only checked at a superficial level and ideally they should be maintained by their original authors.
+Share your Hub pipeline by creating a model repository on the Hub and uploading the one_step_unet.py file to it.
-Contributing a community pipeline is a great way to understand how Diffusers models and schedulers work. Having contributed a community pipeline is usually the first stepping stone to contributing an official pipeline to the
-core package.
+
+
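+Once your pipeline is shared, anyone can load it by passing its file name (for a GitHub pipeline) or the Hub repository id that hosts it to the `custom_pipeline` argument. A minimal sketch using the one-step pipeline from above (the Hub repository id is a placeholder for your own):
+
+```py
+from diffusers import DiffusionPipeline
+
+# GitHub community pipeline: reference the file name in examples/community
+pipeline = DiffusionPipeline.from_pretrained(
+    "google/ddpm-cifar10-32", custom_pipeline="one_step_unet", use_safetensors=True
+)
+
+# Hub community pipeline: reference the model repository that hosts one_step_unet.py
+pipeline = DiffusionPipeline.from_pretrained(
+    "google/ddpm-cifar10-32", custom_pipeline="<your-username>/one_step_unet", use_safetensors=True
+)
+output = pipeline()
+```
+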
### 7. Contribute to training examples
diff --git a/docs/source/en/optimization/fp16.md b/docs/source/en/optimization/fp16.md
index 7a2cf934985c..b21b61368826 100644
--- a/docs/source/en/optimization/fp16.md
+++ b/docs/source/en/optimization/fp16.md
@@ -12,27 +12,23 @@ specific language governing permissions and limitations under the License.
# Speed up inference
-There are several ways to optimize 🤗 Diffusers for inference speed. As a general rule of thumb, we recommend using either [xFormers](xformers) or `torch.nn.functional.scaled_dot_product_attention` in PyTorch 2.0 for their memory-efficient attention.
+There are several ways to optimize Diffusers for inference speed, such as reducing the computational burden by lowering the data precision or using a lightweight distilled model. There are also memory-efficient attention implementations, [xFormers](xformers) and [scaled dot product attention](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) in PyTorch 2.0, that reduce memory usage, which also indirectly speeds up inference. Different speed optimizations can be stacked together to get the fastest inference times.
-
+> [!TIP]
+> Optimizing for inference speed or reduced memory usage can lead to improved performance in the other category, so you should try to optimize for both whenever you can. This guide focuses on inference speed, but you can learn more about lowering memory usage in the [Reduce memory usage](memory) guide.
-In many cases, optimizing for speed or memory leads to improved performance in the other, so you should try to optimize for both whenever you can. This guide focuses on inference speed, but you can learn more about preserving memory in the [Reduce memory usage](memory) guide.
+The inference times below are obtained from generating a single 512x512 image from the prompt "a photo of an astronaut riding a horse on mars" with 50 DDIM steps on an NVIDIA A100.
-
+| setup | latency | speed-up |
+|----------|---------|----------|
+| baseline | 5.27s | x1 |
+| tf32 | 4.14s | x1.27 |
+| fp16 | 3.51s | x1.50 |
+| combined | 3.41s | x1.54 |
-The results below are obtained from generating a single 512x512 image from the prompt `a photo of an astronaut riding a horse on mars` with 50 DDIM steps on a Nvidia Titan RTX, demonstrating the speed-up you can expect.
+## TensorFloat-32
-| | latency | speed-up |
-| ---------------- | ------- | ------- |
-| original | 9.50s | x1 |
-| fp16 | 3.61s | x2.63 |
-| channels last | 3.30s | x2.88 |
-| traced UNet | 3.21s | x2.96 |
-| memory efficient attention | 2.63s | x3.61 |
-
-## Use TensorFloat-32
-
-On Ampere and later CUDA devices, matrix multiplications and convolutions can use the [TensorFloat-32 (TF32)](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) mode for faster, but slightly less accurate computations. By default, PyTorch enables TF32 mode for convolutions but not matrix multiplications. Unless your network requires full float32 precision, we recommend enabling TF32 for matrix multiplications. It can significantly speeds up computations with typically negligible loss in numerical accuracy.
+On Ampere and later CUDA devices, matrix multiplications and convolutions can use the [TensorFloat-32 (tf32)](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) mode for faster, but slightly less accurate computations. By default, PyTorch enables tf32 mode for convolutions but not matrix multiplications. Unless your network requires full float32 precision, we recommend enabling tf32 for matrix multiplications. It can significantly speed up computations with typically negligible loss in numerical accuracy.
```python
import torch
@@ -40,11 +36,11 @@ import torch
torch.backends.cuda.matmul.allow_tf32 = True
```
-You can learn more about TF32 in the [Mixed precision training](https://huggingface.co/docs/transformers/en/perf_train_gpu_one#tf32) guide.
+Learn more about tf32 in the [Mixed precision training](https://huggingface.co/docs/transformers/en/perf_train_gpu_one#tf32) guide.
## Half-precision weights
-To save GPU memory and get more speed, try loading and running the model weights directly in half-precision or float16:
+To save GPU memory and get more speed, set `torch_dtype=torch.float16` to load and run the model weights directly in half-precision.
```Python
import torch
@@ -56,19 +52,76 @@ pipe = DiffusionPipeline.from_pretrained(
use_safetensors=True,
)
pipe = pipe.to("cuda")
-
-prompt = "a photo of an astronaut riding a horse on mars"
-image = pipe(prompt).images[0]
```
-
+> [!WARNING]
+> Don't use [torch.autocast](https://pytorch.org/docs/stable/amp.html#torch.autocast) in any of the pipelines as it can lead to black images and is always slower than pure float16 precision.
-Don't use [`torch.autocast`](https://pytorch.org/docs/stable/amp.html#torch.autocast) in any of the pipelines as it can lead to black images and is always slower than pure float16 precision.
+## Distilled model
-
+You could also use a distilled Stable Diffusion model and autoencoder to speed up inference. During distillation, many of the UNet's residual and attention blocks are shed to reduce the model size by 51% and improve latency on CPU/GPU by 43%. The distilled model is faster and uses less memory while generating images of comparable quality to the full Stable Diffusion model.
-## Distilled model
+> [!TIP]
+> Read the [Open-sourcing Knowledge Distillation Code and Weights of SD-Small and SD-Tiny](https://huggingface.co/blog/sd_distillation) blog post to learn more about how knowledge distillation training works to produce a faster, smaller, and cheaper generative model.
-You could also use a distilled Stable Diffusion model and autoencoder to speed up inference. During distillation, many of the UNet's residual and attention blocks are shed to reduce the model size. The distilled model is faster and uses less memory while generating images of comparable quality to the full Stable Diffusion model.
+The inference times below are obtained from generating 4 images from the prompt "a photo of an astronaut riding a horse on mars" with 25 PNDM steps on an NVIDIA A100. Each generation is repeated 3 times with the distilled Stable Diffusion v1.4 model by [Nota AI](https://hf.co/nota-ai).
+
+| setup | latency | speed-up |
+|------------------------------|---------|----------|
+| baseline | 6.37s | x1 |
+| distilled | 4.18s | x1.52 |
+| distilled + tiny autoencoder | 3.83s | x1.66 |
+
+Let's load the distilled Stable Diffusion model and compare it against the original Stable Diffusion model.
+
+```py
+from diffusers import StableDiffusionPipeline
+import torch
+
+distilled = StableDiffusionPipeline.from_pretrained(
+ "nota-ai/bk-sdm-small", torch_dtype=torch.float16, use_safetensors=True,
+).to("cuda")
+prompt = "a golden vase with different flowers"
+generator = torch.manual_seed(2023)
+image = distilled(prompt, num_inference_steps=25, generator=generator).images[0]
+image
+```
+
+<!-- image comparison: original Stable Diffusion | distilled Stable Diffusion -->
+
+### Tiny AutoEncoder
+
+To speed inference up even more, replace the autoencoder with a [distilled version](https://huggingface.co/sayakpaul/taesdxl-diffusers) of it.
+
+```py
+import torch
+from diffusers import AutoencoderTiny, StableDiffusionPipeline
+
+distilled = StableDiffusionPipeline.from_pretrained(
+ "nota-ai/bk-sdm-small", torch_dtype=torch.float16, use_safetensors=True,
+).to("cuda")
+distilled.vae = AutoencoderTiny.from_pretrained(
+ "sayakpaul/taesd-diffusers", torch_dtype=torch.float16, use_safetensors=True,
+).to("cuda")
+
+prompt = "a golden vase with different flowers"
+generator = torch.manual_seed(2023)
+image = distilled(prompt, num_inference_steps=25, generator=generator).images[0]
+image
+```
-Learn more about in the [Distilled Stable Diffusion inference](../using-diffusers/distilled_sd) guide!
+
+<!-- image: distilled Stable Diffusion + Tiny AutoEncoder -->
+
diff --git a/docs/source/en/stable_diffusion.md b/docs/source/en/stable_diffusion.md
index 877a4ac70823..db4953ebbffd 100644
--- a/docs/source/en/stable_diffusion.md
+++ b/docs/source/en/stable_diffusion.md
@@ -49,7 +49,7 @@ One of the simplest ways to speed up inference is to place the pipeline on a GPU
pipeline = pipeline.to("cuda")
```
-To make sure you can use the same image and improve on it, use a [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) and set a seed for [reproducibility](./using-diffusers/reproducibility):
+To make sure you can use the same image and improve on it, use a [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) and set a seed for [reproducibility](./using-diffusers/reusing_seeds):
```python
import torch
diff --git a/docs/source/en/training/kandinsky.md b/docs/source/en/training/kandinsky.md
index 2caec1035fa9..a1854d76c492 100644
--- a/docs/source/en/training/kandinsky.md
+++ b/docs/source/en/training/kandinsky.md
@@ -205,7 +205,7 @@ model_pred = unet(noisy_latents, timesteps, None, added_cond_kwargs=added_cond_k
Once you’ve made all your changes or you’re okay with the default configuration, you’re ready to launch the training script! 🚀
-You'll train on the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset to generate your own Pokémon, but you can also create and train on your own dataset by following the [Create a dataset for training](create_dataset) guide. Set the environment variable `DATASET_NAME` to the name of the dataset on the Hub or if you're training on your own files, set the environment variable `TRAIN_DIR` to a path to your dataset.
+You'll train on the [Naruto BLIP captions](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) dataset to generate your own Naruto characters, but you can also create and train on your own dataset by following the [Create a dataset for training](create_dataset) guide. Set the environment variable `DATASET_NAME` to the name of the dataset on the Hub or if you're training on your own files, set the environment variable `TRAIN_DIR` to a path to your dataset.
If you’re training on more than one GPU, add the `--multi_gpu` parameter to the `accelerate launch` command.
@@ -219,7 +219,7 @@ To monitor training progress with Weights & Biases, add the `--report_to=wandb`
```bash
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
accelerate launch --mixed_precision="fp16" train_text_to_image_prior.py \
--dataset_name=$DATASET_NAME \
@@ -232,17 +232,17 @@ accelerate launch --mixed_precision="fp16" train_text_to_image_prior.py \
--checkpoints_total_limit=3 \
--lr_scheduler="constant" \
--lr_warmup_steps=0 \
- --validation_prompts="A robot pokemon, 4k photo" \
+ --validation_prompts="A robot naruto, 4k photo" \
--report_to="wandb" \
--push_to_hub \
- --output_dir="kandi2-prior-pokemon-model"
+ --output_dir="kandi2-prior-naruto-model"
```
```bash
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
accelerate launch --mixed_precision="fp16" train_text_to_image_decoder.py \
--dataset_name=$DATASET_NAME \
@@ -256,10 +256,10 @@ accelerate launch --mixed_precision="fp16" train_text_to_image_decoder.py \
--checkpoints_total_limit=3 \
--lr_scheduler="constant" \
--lr_warmup_steps=0 \
- --validation_prompts="A robot pokemon, 4k photo" \
+ --validation_prompts="A robot naruto, 4k photo" \
--report_to="wandb" \
--push_to_hub \
- --output_dir="kandi2-decoder-pokemon-model"
+ --output_dir="kandi2-decoder-naruto-model"
```
@@ -279,7 +279,7 @@ prior_components = {"prior_" + k: v for k,v in prior_pipeline.components.items()
pipeline = AutoPipelineForText2Image.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", **prior_components, torch_dtype=torch.float16)
pipe.enable_model_cpu_offload()
-prompt="A robot pokemon, 4k photo"
+prompt="A robot naruto, 4k photo"
image = pipeline(prompt=prompt, negative_prompt=negative_prompt).images[0]
```
@@ -299,7 +299,7 @@ import torch
pipeline = AutoPipelineForText2Image.from_pretrained("path/to/saved/model", torch_dtype=torch.float16)
pipeline.enable_model_cpu_offload()
-prompt="A robot pokemon, 4k photo"
+prompt="A robot naruto, 4k photo"
image = pipeline(prompt=prompt).images[0]
```
@@ -313,7 +313,7 @@ unet = UNet2DConditionModel.from_pretrained("path/to/saved/model" + "/checkpoint
pipeline = AutoPipelineForText2Image.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", unet=unet, torch_dtype=torch.float16)
pipeline.enable_model_cpu_offload()
-image = pipeline(prompt="A robot pokemon, 4k photo").images[0]
+image = pipeline(prompt="A robot naruto, 4k photo").images[0]
```
diff --git a/docs/source/en/training/lora.md b/docs/source/en/training/lora.md
index 78ac8a140e7c..737e6f0dfc32 100644
--- a/docs/source/en/training/lora.md
+++ b/docs/source/en/training/lora.md
@@ -170,7 +170,7 @@ Aside from setting up the LoRA layers, the training script is more or less the s
Once you've made all your changes or you're okay with the default configuration, you're ready to launch the training script! 🚀
-Let's train on the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset to generate our own Pokémon. Set the environment variables `MODEL_NAME` and `DATASET_NAME` to the model and dataset respectively. You should also specify where to save the model in `OUTPUT_DIR`, and the name of the model to save to on the Hub with `HUB_MODEL_ID`. The script creates and saves the following files to your repository:
+Let's train on the [Naruto BLIP captions](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) dataset to generate your own Naruto characters. Set the environment variables `MODEL_NAME` and `DATASET_NAME` to the model and dataset respectively. You should also specify where to save the model in `OUTPUT_DIR`, and the name of the model to save to on the Hub with `HUB_MODEL_ID`. The script creates and saves the following files to your repository:
- saved model checkpoints
- `pytorch_lora_weights.safetensors` (the trained LoRA weights)
@@ -185,9 +185,9 @@ A full training run takes ~5 hours on a 2080 Ti GPU with 11GB of VRAM.
```bash
export MODEL_NAME="runwayml/stable-diffusion-v1-5"
-export OUTPUT_DIR="/sddata/finetune/lora/pokemon"
-export HUB_MODEL_ID="pokemon-lora"
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export OUTPUT_DIR="/sddata/finetune/lora/naruto"
+export HUB_MODEL_ID="naruto-lora"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
accelerate launch --mixed_precision="fp16" train_text_to_image_lora.py \
--pretrained_model_name_or_path=$MODEL_NAME \
@@ -208,7 +208,7 @@ accelerate launch --mixed_precision="fp16" train_text_to_image_lora.py \
--hub_model_id=${HUB_MODEL_ID} \
--report_to=wandb \
--checkpointing_steps=500 \
- --validation_prompt="A pokemon with blue eyes." \
+ --validation_prompt="A naruto with blue eyes." \
--seed=1337
```
@@ -220,7 +220,7 @@ import torch
pipeline = AutoPipelineForText2Image.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda")
pipeline.load_lora_weights("path/to/lora/model", weight_name="pytorch_lora_weights.safetensors")
-image = pipeline("A pokemon with blue eyes").images[0]
+image = pipeline("A naruto with blue eyes").images[0]
```
## Next steps
diff --git a/docs/source/en/training/sdxl.md b/docs/source/en/training/sdxl.md
index 0e51e720b48c..78178047d9fd 100644
--- a/docs/source/en/training/sdxl.md
+++ b/docs/source/en/training/sdxl.md
@@ -176,7 +176,7 @@ If you want to learn more about how the training loop works, check out the [Unde
Once you’ve made all your changes or you’re okay with the default configuration, you’re ready to launch the training script! 🚀
-Let’s train on the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset to generate your own Pokémon. Set the environment variables `MODEL_NAME` and `DATASET_NAME` to the model and the dataset (either from the Hub or a local path). You should also specify a VAE other than the SDXL VAE (either from the Hub or a local path) with `VAE_NAME` to avoid numerical instabilities.
+Let’s train on the [Naruto BLIP captions](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) dataset to generate your own Naruto characters. Set the environment variables `MODEL_NAME` and `DATASET_NAME` to the model and the dataset (either from the Hub or a local path). You should also specify a VAE other than the SDXL VAE (either from the Hub or a local path) with `VAE_NAME` to avoid numerical instabilities.
@@ -187,7 +187,7 @@ To monitor training progress with Weights & Biases, add the `--report_to=wandb`
```bash
export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0"
export VAE_NAME="madebyollin/sdxl-vae-fp16-fix"
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
accelerate launch train_text_to_image_sdxl.py \
--pretrained_model_name_or_path=$MODEL_NAME \
@@ -211,7 +211,7 @@ accelerate launch train_text_to_image_sdxl.py \
--validation_prompt="a cute Sundar Pichai creature" \
--validation_epochs 5 \
--checkpointing_steps=5000 \
- --output_dir="sdxl-pokemon-model" \
+ --output_dir="sdxl-naruto-model" \
--push_to_hub
```
@@ -226,9 +226,9 @@ import torch
pipeline = DiffusionPipeline.from_pretrained("path/to/your/model", torch_dtype=torch.float16).to("cuda")
-prompt = "A pokemon with green eyes and red legs."
+prompt = "A naruto with green eyes and red legs."
image = pipeline(prompt, num_inference_steps=30, guidance_scale=7.5).images[0]
-image.save("pokemon.png")
+image.save("naruto.png")
```
@@ -244,11 +244,11 @@ import torch_xla.core.xla_model as xm
device = xm.xla_device()
pipeline = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0").to(device)
-prompt = "A pokemon with green eyes and red legs."
+prompt = "A naruto with green eyes and red legs."
start = time()
image = pipeline(prompt, num_inference_steps=inference_steps).images[0]
print(f'Compilation time is {time()-start} sec')
-image.save("pokemon.png")
+image.save("naruto.png")
start = time()
image = pipeline(prompt, num_inference_steps=inference_steps).images[0]
diff --git a/docs/source/en/training/text2image.md b/docs/source/en/training/text2image.md
index d5c772c9db86..f69e9a710e8f 100644
--- a/docs/source/en/training/text2image.md
+++ b/docs/source/en/training/text2image.md
@@ -158,7 +158,7 @@ Once you've made all your changes or you're okay with the default configuration,
-Let's train on the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset to generate your own Pokémon. Set the environment variables `MODEL_NAME` and `dataset_name` to the model and the dataset (either from the Hub or a local path). If you're training on more than one GPU, add the `--multi_gpu` parameter to the `accelerate launch` command.
+Let's train on the [Naruto BLIP captions](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) dataset to generate your own Naruto characters. Set the environment variables `MODEL_NAME` and `dataset_name` to the model and the dataset (either from the Hub or a local path). If you're training on more than one GPU, add the `--multi_gpu` parameter to the `accelerate launch` command.
@@ -168,7 +168,7 @@ To train on a local dataset, set the `TRAIN_DIR` and `OUTPUT_DIR` environment va
```bash
export MODEL_NAME="runwayml/stable-diffusion-v1-5"
-export dataset_name="lambdalabs/pokemon-blip-captions"
+export dataset_name="lambdalabs/naruto-blip-captions"
accelerate launch --mixed_precision="fp16" train_text_to_image.py \
--pretrained_model_name_or_path=$MODEL_NAME \
@@ -183,7 +183,7 @@ accelerate launch --mixed_precision="fp16" train_text_to_image.py \
--max_grad_norm=1 \
--enable_xformers_memory_efficient_attention
--lr_scheduler="constant" --lr_warmup_steps=0 \
- --output_dir="sd-pokemon-model" \
+ --output_dir="sd-naruto-model" \
--push_to_hub
```
@@ -202,7 +202,7 @@ To train on a local dataset, set the `TRAIN_DIR` and `OUTPUT_DIR` environment va
```bash
export MODEL_NAME="runwayml/stable-diffusion-v1-5"
-export dataset_name="lambdalabs/pokemon-blip-captions"
+export dataset_name="lambdalabs/naruto-blip-captions"
python train_text_to_image_flax.py \
--pretrained_model_name_or_path=$MODEL_NAME \
@@ -212,7 +212,7 @@ python train_text_to_image_flax.py \
--max_train_steps=15000 \
--learning_rate=1e-05 \
--max_grad_norm=1 \
- --output_dir="sd-pokemon-model" \
+ --output_dir="sd-naruto-model" \
--push_to_hub
```
@@ -231,7 +231,7 @@ import torch
pipeline = StableDiffusionPipeline.from_pretrained("path/to/saved_model", torch_dtype=torch.float16, use_safetensors=True).to("cuda")
image = pipeline(prompt="yoda").images[0]
-image.save("yoda-pokemon.png")
+image.save("yoda-naruto.png")
```
@@ -246,7 +246,7 @@ from diffusers import FlaxStableDiffusionPipeline
pipeline, params = FlaxStableDiffusionPipeline.from_pretrained("path/to/saved_model", dtype=jax.numpy.bfloat16)
-prompt = "yoda pokemon"
+prompt = "yoda naruto"
prng_seed = jax.random.PRNGKey(0)
num_inference_steps = 50
@@ -261,7 +261,7 @@ prompt_ids = shard(prompt_ids)
images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).images
images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
-image.save("yoda-pokemon.png")
+image.save("yoda-naruto.png")
```
diff --git a/docs/source/en/training/wuerstchen.md b/docs/source/en/training/wuerstchen.md
index c8d2842eb833..cd190639b865 100644
--- a/docs/source/en/training/wuerstchen.md
+++ b/docs/source/en/training/wuerstchen.md
@@ -131,7 +131,7 @@ If you want to learn more about how the training loop works, check out the [Unde
Once you’ve made all your changes or you’re okay with the default configuration, you’re ready to launch the training script! 🚀
-Set the `DATASET_NAME` environment variable to the dataset name from the Hub. This guide uses the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset, but you can create and train on your own datasets as well (see the [Create a dataset for training](create_dataset) guide).
+Set the `DATASET_NAME` environment variable to the dataset name from the Hub. This guide uses the [Naruto BLIP captions](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) dataset, but you can create and train on your own datasets as well (see the [Create a dataset for training](create_dataset) guide).
@@ -140,7 +140,7 @@ To monitor training progress with Weights & Biases, add the `--report_to=wandb`
```bash
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
accelerate launch train_text_to_image_prior.py \
--mixed_precision="fp16" \
@@ -156,10 +156,10 @@ accelerate launch train_text_to_image_prior.py \
--checkpoints_total_limit=3 \
--lr_scheduler="constant" \
--lr_warmup_steps=0 \
- --validation_prompts="A robot pokemon, 4k photo" \
+ --validation_prompts="A robot naruto, 4k photo" \
--report_to="wandb" \
--push_to_hub \
- --output_dir="wuerstchen-prior-pokemon-model"
+ --output_dir="wuerstchen-prior-naruto-model"
```
Once training is complete, you can use your newly trained model for inference!
@@ -171,7 +171,7 @@ from diffusers.pipelines.wuerstchen import DEFAULT_STAGE_C_TIMESTEPS
pipeline = AutoPipelineForText2Image.from_pretrained("path/to/saved/model", torch_dtype=torch.float16).to("cuda")
-caption = "A cute bird pokemon holding a shield"
+caption = "A cute bird naruto holding a shield"
images = pipeline(
caption,
width=1024,
diff --git a/docs/source/en/using-diffusers/contribute_pipeline.md b/docs/source/en/using-diffusers/contribute_pipeline.md
deleted file mode 100644
index e9cf1ed1ce02..000000000000
--- a/docs/source/en/using-diffusers/contribute_pipeline.md
+++ /dev/null
@@ -1,184 +0,0 @@
-
-
-# Contribute a community pipeline
-
-
-
-💡 Take a look at GitHub Issue [#841](https://github.com/huggingface/diffusers/issues/841) for more context about why we're adding community pipelines to help everyone easily share their work without being slowed down.
-
-
-
-Community pipelines allow you to add any additional features you'd like on top of the [`DiffusionPipeline`]. The main benefit of building on top of the `DiffusionPipeline` is anyone can load and use your pipeline by only adding one more argument, making it super easy for the community to access.
-
-This guide will show you how to create a community pipeline and explain how they work. To keep things simple, you'll create a "one-step" pipeline where the `UNet` does a single forward pass and calls the scheduler once.
-
-## Initialize the pipeline
-
-You should start by creating a `one_step_unet.py` file for your community pipeline. In this file, create a pipeline class that inherits from the [`DiffusionPipeline`] to be able to load model weights and the scheduler configuration from the Hub. The one-step pipeline needs a `UNet` and a scheduler, so you'll need to add these as arguments to the `__init__` function:
-
-```python
-from diffusers import DiffusionPipeline
-import torch
-
-class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
- def __init__(self, unet, scheduler):
- super().__init__()
-```
-
-To ensure your pipeline and its components (`unet` and `scheduler`) can be saved with [`~DiffusionPipeline.save_pretrained`], add them to the `register_modules` function:
-
-```diff
- from diffusers import DiffusionPipeline
- import torch
-
- class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
- def __init__(self, unet, scheduler):
- super().__init__()
-
-+ self.register_modules(unet=unet, scheduler=scheduler)
-```
-
-Cool, the `__init__` step is done and you can move to the forward pass now! 🔥
-
-## Define the forward pass
-
-In the forward pass, which we recommend defining as `__call__`, you have complete creative freedom to add whatever feature you'd like. For our amazing one-step pipeline, create a random image and only call the `unet` and `scheduler` once by setting `timestep=1`:
-
-```diff
- from diffusers import DiffusionPipeline
- import torch
-
- class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
- def __init__(self, unet, scheduler):
- super().__init__()
-
- self.register_modules(unet=unet, scheduler=scheduler)
-
-+ def __call__(self):
-+ image = torch.randn(
-+ (1, self.unet.config.in_channels, self.unet.config.sample_size, self.unet.config.sample_size),
-+ )
-+ timestep = 1
-
-+ model_output = self.unet(image, timestep).sample
-+ scheduler_output = self.scheduler.step(model_output, timestep, image).prev_sample
-
-+ return scheduler_output
-```
-
-That's it! 🚀 You can now run this pipeline by passing a `unet` and `scheduler` to it:
-
-```python
-from diffusers import DDPMScheduler, UNet2DModel
-
-scheduler = DDPMScheduler()
-unet = UNet2DModel()
-
-pipeline = UnetSchedulerOneForwardPipeline(unet=unet, scheduler=scheduler)
-
-output = pipeline()
-```
-
-But what's even better is you can load pre-existing weights into the pipeline if the pipeline structure is identical. For example, you can load the [`google/ddpm-cifar10-32`](https://huggingface.co/google/ddpm-cifar10-32) weights into the one-step pipeline:
-
-```python
-pipeline = UnetSchedulerOneForwardPipeline.from_pretrained("google/ddpm-cifar10-32", use_safetensors=True)
-
-output = pipeline()
-```
-
-## Share your pipeline
-
-Open a Pull Request on the 🧨 Diffusers [repository](https://github.com/huggingface/diffusers) to add your awesome pipeline in `one_step_unet.py` to the [examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) subfolder.
-
-Once it is merged, anyone with `diffusers >= 0.4.0` installed can use this pipeline magically 🪄 by specifying it in the `custom_pipeline` argument:
-
-```python
-from diffusers import DiffusionPipeline
-
-pipe = DiffusionPipeline.from_pretrained(
- "google/ddpm-cifar10-32", custom_pipeline="one_step_unet", use_safetensors=True
-)
-pipe()
-```
-
-Another way to share your community pipeline is to upload the `one_step_unet.py` file directly to your preferred [model repository](https://huggingface.co/docs/hub/models-uploading) on the Hub. Instead of specifying the `one_step_unet.py` file, pass the model repository id to the `custom_pipeline` argument:
-
-```python
-from diffusers import DiffusionPipeline
-
-pipeline = DiffusionPipeline.from_pretrained(
- "google/ddpm-cifar10-32", custom_pipeline="stevhliu/one_step_unet", use_safetensors=True
-)
-```
-
-Take a look at the following table to compare the two sharing workflows to help you decide the best option for you:
-
-| | GitHub community pipeline | HF Hub community pipeline |
-|----------------|------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------|
-| usage | same | same |
-| review process | open a Pull Request on GitHub and undergo a review process from the Diffusers team before merging; may be slower | upload directly to a Hub repository without any review; this is the fastest workflow |
-| visibility | included in the official Diffusers repository and documentation | included on your HF Hub profile and relies on your own usage/promotion to gain visibility |
-
-
-
-💡 You can use whatever package you want in your community pipeline file - as long as the user has it installed, everything will work fine. Make sure you have one and only one pipeline class that inherits from `DiffusionPipeline` because this is automatically detected.
-
-
-
-## How do community pipelines work?
-
-A community pipeline is a class that inherits from [`DiffusionPipeline`] which means:
-
-- It can be loaded with the [`custom_pipeline`] argument.
-- The model weights and scheduler configuration are loaded from [`pretrained_model_name_or_path`].
-- The code that implements a feature in the community pipeline is defined in a `pipeline.py` file.
-
-Sometimes you can't load all the pipeline components weights from an official repository. In this case, the other components should be passed directly to the pipeline:
-
-```python
-from diffusers import DiffusionPipeline
-from transformers import CLIPImageProcessor, CLIPModel
-
-model_id = "CompVis/stable-diffusion-v1-4"
-clip_model_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
-
-feature_extractor = CLIPImageProcessor.from_pretrained(clip_model_id)
-clip_model = CLIPModel.from_pretrained(clip_model_id, torch_dtype=torch.float16)
-
-pipeline = DiffusionPipeline.from_pretrained(
- model_id,
- custom_pipeline="clip_guided_stable_diffusion",
- clip_model=clip_model,
- feature_extractor=feature_extractor,
- scheduler=scheduler,
- torch_dtype=torch.float16,
- use_safetensors=True,
-)
-```
-
-The magic behind community pipelines is contained in the following code. It allows the community pipeline to be loaded from GitHub or the Hub, and it'll be available to all 🧨 Diffusers packages.
-
-```python
-# 2. Load the pipeline class, if using custom module then load it from the Hub
-# if we load from explicit class, let's use it
-if custom_pipeline is not None:
- pipeline_class = get_class_from_dynamic_module(
- custom_pipeline, module_file=CUSTOM_PIPELINE_FILE_NAME, cache_dir=custom_pipeline
- )
-elif cls != DiffusionPipeline:
- pipeline_class = cls
-else:
- diffusers_module = importlib.import_module(cls.__module__.split(".")[0])
- pipeline_class = getattr(diffusers_module, config_dict["_class_name"])
-```
diff --git a/docs/source/en/using-diffusers/control_brightness.md b/docs/source/en/using-diffusers/control_brightness.md
deleted file mode 100644
index 5fad664f60f3..000000000000
--- a/docs/source/en/using-diffusers/control_brightness.md
+++ /dev/null
@@ -1,58 +0,0 @@
-
-
-# Control image brightness
-
-The Stable Diffusion pipeline is mediocre at generating images that are either very bright or dark as explained in the [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) paper. The solutions proposed in the paper are currently implemented in the [`DDIMScheduler`] which you can use to improve the lighting in your images.
-
-
-
-💡 Take a look at the paper linked above for more details about the proposed solutions!
-
-
-
-One of the solutions is to train a model with *v prediction* and *v loss*. Add the following flag to the [`train_text_to_image.py`](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py) or [`train_text_to_image_lora.py`](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora.py) scripts to enable `v_prediction`:
-
-```bash
---prediction_type="v_prediction"
-```
-
-For example, let's use the [`ptx0/pseudo-journey-v2`](https://huggingface.co/ptx0/pseudo-journey-v2) checkpoint which has been finetuned with `v_prediction`.
-
-Next, configure the following parameters in the [`DDIMScheduler`]:
-
-1. `rescale_betas_zero_snr=True`, rescales the noise schedule to zero terminal signal-to-noise ratio (SNR)
-2. `timestep_spacing="trailing"`, starts sampling from the last timestep
-
-```py
-from diffusers import DiffusionPipeline, DDIMScheduler
-
-pipeline = DiffusionPipeline.from_pretrained("ptx0/pseudo-journey-v2", use_safetensors=True)
-
-# switch the scheduler in the pipeline to use the DDIMScheduler
-pipeline.scheduler = DDIMScheduler.from_config(
- pipeline.scheduler.config, rescale_betas_zero_snr=True, timestep_spacing="trailing"
-)
-pipeline.to("cuda")
-```
-
-Finally, in your call to the pipeline, set `guidance_rescale` to prevent overexposure:
-
-```py
-prompt = "A lion in galaxies, spirals, nebulae, stars, smoke, iridescent, intricate detail, octane render, 8k"
-image = pipeline(prompt, guidance_rescale=0.7).images[0]
-image
-```
-
-
-
-
diff --git a/docs/source/en/using-diffusers/custom_pipeline_examples.md b/docs/source/en/using-diffusers/custom_pipeline_examples.md
deleted file mode 100644
index 203302ed3ead..000000000000
--- a/docs/source/en/using-diffusers/custom_pipeline_examples.md
+++ /dev/null
@@ -1,119 +0,0 @@
-
-
-# Community pipelines
-
-[[open-in-colab]]
-
-
-
-For more context about the design choices behind community pipelines, please have a look at [this issue](https://github.com/huggingface/diffusers/issues/841).
-
-
-
-Community pipelines allow you to get creative and build your own unique pipelines to share with the community. You can find all community pipelines in the [diffusers/examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) folder along with inference and training examples for how to use them. This guide showcases some of the community pipelines and hopefully it'll inspire you to create your own (feel free to open a PR with your own pipeline and we will merge it!).
-
-To load a community pipeline, use the `custom_pipeline` argument in [`DiffusionPipeline`] to specify one of the files in [diffusers/examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community):
-
-```py
-from diffusers import DiffusionPipeline
-
-pipe = DiffusionPipeline.from_pretrained(
- "CompVis/stable-diffusion-v1-4", custom_pipeline="filename_in_the_community_folder", use_safetensors=True
-)
-```
-
-If a community pipeline doesn't work as expected, please open a GitHub issue and mention the author.
-
-You can learn more about community pipelines in the how to [load community pipelines](custom_pipeline_overview) and how to [contribute a community pipeline](contribute_pipeline) guides.
-
-## Multilingual Stable Diffusion
-
-The multilingual Stable Diffusion pipeline uses a pretrained [XLM-RoBERTa](https://huggingface.co/papluca/xlm-roberta-base-language-detection) to identify a language and the [mBART-large-50](https://huggingface.co/facebook/mbart-large-50-many-to-one-mmt) model to handle the translation. This allows you to generate images from text in 20 languages.
-
-```py
-import torch
-from diffusers import DiffusionPipeline
-from diffusers.utils import make_image_grid
-from transformers import (
- pipeline,
- MBart50TokenizerFast,
- MBartForConditionalGeneration,
-)
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-device_dict = {"cuda": 0, "cpu": -1}
-
-# add language detection pipeline
-language_detection_model_ckpt = "papluca/xlm-roberta-base-language-detection"
-language_detection_pipeline = pipeline("text-classification",
- model=language_detection_model_ckpt,
- device=device_dict[device])
-
-# add model for language translation
-translation_tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-one-mmt")
-translation_model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-one-mmt").to(device)
-
-diffuser_pipeline = DiffusionPipeline.from_pretrained(
- "CompVis/stable-diffusion-v1-4",
- custom_pipeline="multilingual_stable_diffusion",
- detection_pipeline=language_detection_pipeline,
- translation_model=translation_model,
- translation_tokenizer=translation_tokenizer,
- torch_dtype=torch.float16,
-)
-
-diffuser_pipeline.enable_attention_slicing()
-diffuser_pipeline = diffuser_pipeline.to(device)
-
-prompt = ["a photograph of an astronaut riding a horse",
- "Una casa en la playa",
- "Ein Hund, der Orange isst",
- "Un restaurant parisien"]
-
-images = diffuser_pipeline(prompt).images
-make_image_grid(images, rows=2, cols=2)
-```
-
-
-
-
-
-## MagicMix
-
-[MagicMix](https://huggingface.co/papers/2210.16056) is a pipeline that can mix an image and text prompt to generate a new image that preserves the image structure. The `mix_factor` determines how much influence the prompt has on the layout generation, `kmin` controls the number of steps during the content generation process, and `kmax` determines how much information is kept in the layout of the original image.
-
-```py
-from diffusers import DiffusionPipeline, DDIMScheduler
-from diffusers.utils import load_image, make_image_grid
-
-pipeline = DiffusionPipeline.from_pretrained(
- "CompVis/stable-diffusion-v1-4",
- custom_pipeline="magic_mix",
- scheduler=DDIMScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler"),
-).to('cuda')
-
-img = load_image("https://user-images.githubusercontent.com/59410571/209578593-141467c7-d831-4792-8b9a-b17dc5e47816.jpg")
-mix_img = pipeline(img, prompt="bed", kmin=0.3, kmax=0.5, mix_factor=0.5)
-make_image_grid([img, mix_img], rows=1, cols=2)
-```
-
-
-
-
-
original image
-
-
-
-
image and text prompt mix
-
-
diff --git a/docs/source/en/using-diffusers/custom_pipeline_overview.md b/docs/source/en/using-diffusers/custom_pipeline_overview.md
index 0b6bb53f10d6..ef26e546e4d4 100644
--- a/docs/source/en/using-diffusers/custom_pipeline_overview.md
+++ b/docs/source/en/using-diffusers/custom_pipeline_overview.md
@@ -16,11 +16,19 @@ specific language governing permissions and limitations under the License.
## Community pipelines
+> [!TIP]
+> Take a look at GitHub Issue [#841](https://github.com/huggingface/diffusers/issues/841) for more context about why we're adding community pipelines to help everyone easily share their work without being slowed down.
+
Community pipelines are any [`DiffusionPipeline`] class that is different from the original paper implementation (for example, the [`StableDiffusionControlNetPipeline`] corresponds to the [Text-to-Image Generation with ControlNet Conditioning](https://arxiv.org/abs/2302.05543) paper). They provide additional functionality or extend the original implementation of a pipeline.
There are many cool community pipelines like [Marigold Depth Estimation](https://github.com/huggingface/diffusers/tree/main/examples/community#marigold-depth-estimation) or [InstantID](https://github.com/huggingface/diffusers/tree/main/examples/community#instantid-pipeline), and you can find all the official community pipelines [here](https://github.com/huggingface/diffusers/tree/main/examples/community).
-There are two types of community pipelines, those stored on the Hugging Face Hub and those stored on Diffusers GitHub repository. Hub pipelines are completely customizable (scheduler, models, pipeline code, etc.) while Diffusers GitHub pipelines are only limited to custom pipeline code. Refer to this [table](./contribute_pipeline#share-your-pipeline) for a more detailed comparison of Hub vs GitHub community pipelines.
+There are two types of community pipelines: those stored on the Hugging Face Hub and those stored in the Diffusers GitHub repository. Hub pipelines are completely customizable (scheduler, models, pipeline code, etc.), while Diffusers GitHub pipelines are limited to custom pipeline code only.
+
+| | GitHub community pipeline | HF Hub community pipeline |
+|----------------|------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------|
+| usage | same | same |
+| review process | open a Pull Request on GitHub and undergo a review process from the Diffusers team before merging; may be slower | upload directly to a Hub repository without any review; this is the fastest workflow |
+| visibility | included in the official Diffusers repository and documentation | included on your HF Hub profile and relies on your own usage/promotion to gain visibility |
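+
+For either type, only the value passed to `custom_pipeline` changes: a file name from the community folder for GitHub pipelines, or a Hub repository id that contains a `pipeline.py` file for Hub pipelines. A minimal sketch (both `custom_pipeline` values below are placeholders):
+
+```py
+from diffusers import DiffusionPipeline
+
+# GitHub community pipeline: pass the file name from diffusers/examples/community
+pipeline = DiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4", custom_pipeline="filename_in_the_community_folder", use_safetensors=True
+)
+
+# Hub community pipeline: pass the repository id that hosts the pipeline.py file
+pipeline = DiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4", custom_pipeline="your-username/your-custom-pipeline", use_safetensors=True
+)
+```
+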
+## Deterministic batch generation
-Generate the images and have a look:
+A practical application of creating reproducible pipelines is *deterministic batch generation*. You generate a batch of images and select one image to improve with a more detailed prompt. The main idea is to pass a list of [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) objects to the pipeline and tie each `Generator` to a seed so you can reuse it.
-```python
-images = pipe(prompt, generator=generator, num_images_per_prompt=4).images
-make_image_grid(images, rows=2, cols=2)
+Let's use the [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) checkpoint and generate a batch of images.
+
+```py
+import torch
+from diffusers import DiffusionPipeline
+from diffusers.utils import make_image_grid
+
+pipeline = DiffusionPipeline.from_pretrained(
+ "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True
+)
+pipeline = pipeline.to("cuda")
```
-![img](https://huggingface.co/datasets/diffusers/diffusers-images-docs/resolve/main/reusabe_seeds.jpg)
+Define four different `Generator`s and assign each `Generator` a seed (`0` to `3`). Then generate a batch of images and pick one to iterate on.
-In this example, you'll improve upon the first image - but in reality, you can use any image you want (even the image with double sets of eyes!). The first image used the `Generator` with seed `0`, so you'll reuse that `Generator` for the second round of inference. To improve the quality of the image, add some additional text to the prompt:
+> [!WARNING]
+> Use a list comprehension that iterates over the batch size specified in `range()` to create a unique `Generator` object for each image in the batch. If you multiply the `Generator` by the batch size integer, it only creates *one* `Generator` object that is used sequentially for each image in the batch.
+>
+> ```py
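+> # this creates only one Generator object, which is reused for every image in the batch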
+> [torch.Generator().manual_seed(seed)] * 4
+> ```
```python
-prompt = [prompt + t for t in [", highly realistic", ", artsy", ", trending", ", colorful"]]
-generator = [torch.Generator(device="cuda").manual_seed(0) for i in range(4)]
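+# a list comprehension creates four distinct Generator objects, seeded 0 through 3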
+generator = [torch.Generator(device="cuda").manual_seed(i) for i in range(4)]
+prompt = "Labrador in the style of Vermeer"
+images = pipeline(prompt, generator=generator, num_images_per_prompt=4).images
+make_image_grid(images, rows=2, cols=2)
```
-Create four generators with seed `0`, and generate another batch of images, all of which should look like the first image from the previous round!
+
+
+
+
+Let's improve the first image (you can choose any image you want), which corresponds to the `Generator` with seed `0`. Add some additional text to your prompt, and make sure you reuse the same `Generator` with seed `0`. All the generated images should resemble the first image.
```python
-images = pipe(prompt, generator=generator).images
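+# reuse the seed 0 Generator so each prompt variation starts from the same noise as the first image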
+prompt = [prompt + t for t in [", highly realistic", ", artsy", ", trending", ", colorful"]]
+generator = [torch.Generator(device="cuda").manual_seed(0) for i in range(4)]
+images = pipeline(prompt, generator=generator).images
make_image_grid(images, rows=2, cols=2)
```
-![img](https://huggingface.co/datasets/diffusers/diffusers-images-docs/resolve/main/reusabe_seeds_2.jpg)
+
+
+
diff --git a/docs/source/ja/stable_diffusion.md b/docs/source/ja/stable_diffusion.md
index cae244dab2b6..b22178b75529 100644
--- a/docs/source/ja/stable_diffusion.md
+++ b/docs/source/ja/stable_diffusion.md
@@ -49,7 +49,7 @@ prompt = "portrait photo of a old warrior chief"
pipeline = pipeline.to("cuda")
```
-同じイメージを使って改良できるようにするには、[`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html)を使い、[reproducibility](./using-diffusers/reproducibility)の種を設定します:
+同じイメージを使って改良できるようにするには、[`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html)を使い、[reproducibility](./using-diffusers/reusing_seeds)の種を設定します:
```python
import torch
diff --git a/docs/source/ko/stable_diffusion.md b/docs/source/ko/stable_diffusion.md
index 678db4ff9eae..342f17bbff31 100644
--- a/docs/source/ko/stable_diffusion.md
+++ b/docs/source/ko/stable_diffusion.md
@@ -49,7 +49,7 @@ prompt = "portrait photo of a old warrior chief"
pipeline = pipeline.to("cuda")
```
-동일한 이미지를 사용하고 개선할 수 있는지 확인하려면 [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html)를 사용하고 [재현성](./using-diffusers/reproducibility)에 대한 시드를 설정하세요:
+동일한 이미지를 사용하고 개선할 수 있는지 확인하려면 [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html)를 사용하고 [재현성](./using-diffusers/reusing_seeds)에 대한 시드를 설정하세요:
```python
import torch
diff --git a/docs/source/ko/training/lora.md b/docs/source/ko/training/lora.md
index 5bb8a1e69be4..e9c690d80652 100644
--- a/docs/source/ko/training/lora.md
+++ b/docs/source/ko/training/lora.md
@@ -49,15 +49,15 @@ huggingface-cli login
### 학습[[dreambooth-training]]
-[Pokémon BLIP 캡션](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) 데이터셋으로 [`stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5)를 파인튜닝해 나만의 포켓몬을 생성해 보겠습니다.
+[Naruto BLIP 캡션](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) 데이터셋으로 [`stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5)를 파인튜닝해 나만의 나루토 캐릭터를 생성해 보겠습니다.
시작하려면 `MODEL_NAME` 및 `DATASET_NAME` 환경 변수가 설정되어 있는지 확인하십시오. `OUTPUT_DIR` 및 `HUB_MODEL_ID` 변수는 선택 사항이며 허브에서 모델을 저장할 위치를 지정합니다.
```bash
export MODEL_NAME="runwayml/stable-diffusion-v1-5"
-export OUTPUT_DIR="/sddata/finetune/lora/pokemon"
-export HUB_MODEL_ID="pokemon-lora"
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export OUTPUT_DIR="/sddata/finetune/lora/naruto"
+export HUB_MODEL_ID="naruto-lora"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
```
학습을 시작하기 전에 알아야 할 몇 가지 플래그가 있습니다.
diff --git a/docs/source/ko/training/text2image.md b/docs/source/ko/training/text2image.md
index f2ad3bb0719e..8a0463b497f4 100644
--- a/docs/source/ko/training/text2image.md
+++ b/docs/source/ko/training/text2image.md
@@ -73,12 +73,12 @@ xFormers는 Flax에 사용할 수 없습니다.
-다음과 같이 [Pokémon BLIP 캡션](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) 데이터셋에서 파인튜닝 실행을 위해 [PyTorch 학습 스크립트](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py)를 실행합니다:
+다음과 같이 [Naruto BLIP 캡션](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) 데이터셋에서 파인튜닝 실행을 위해 [PyTorch 학습 스크립트](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py)를 실행합니다:
```bash
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
-export dataset_name="lambdalabs/pokemon-blip-captions"
+export dataset_name="lambdalabs/naruto-blip-captions"
accelerate launch train_text_to_image.py \
--pretrained_model_name_or_path=$MODEL_NAME \
@@ -93,7 +93,7 @@ accelerate launch train_text_to_image.py \
--learning_rate=1e-05 \
--max_grad_norm=1 \
--lr_scheduler="constant" --lr_warmup_steps=0 \
- --output_dir="sd-pokemon-model"
+ --output_dir="sd-naruto-model"
```
자체 데이터셋으로 파인튜닝하려면 🤗 [Datasets](https://huggingface.co/docs/datasets/index)에서 요구하는 형식에 따라 데이터셋을 준비하세요. [데이터셋을 허브에 업로드](https://huggingface.co/docs/datasets/image_dataset#upload-dataset-to-the-hub)하거나 [파일들이 있는 로컬 폴더를 준비](https ://huggingface.co/docs/datasets/image_dataset#imagefolder)할 수 있습니다.
@@ -136,7 +136,7 @@ pip install -U -r requirements_flax.txt
```bash
export MODEL_NAME="runwayml/stable-diffusion-v1-5"
-export dataset_name="lambdalabs/pokemon-blip-captions"
+export dataset_name="lambdalabs/naruto-blip-captions"
python train_text_to_image_flax.py \
--pretrained_model_name_or_path=$MODEL_NAME \
@@ -146,7 +146,7 @@ python train_text_to_image_flax.py \
--max_train_steps=15000 \
--learning_rate=1e-05 \
--max_grad_norm=1 \
- --output_dir="sd-pokemon-model"
+ --output_dir="sd-naruto-model"
```
자체 데이터셋으로 파인튜닝하려면 🤗 [Datasets](https://huggingface.co/docs/datasets/index)에서 요구하는 형식에 따라 데이터셋을 준비하세요. [데이터셋을 허브에 업로드](https://huggingface.co/docs/datasets/image_dataset#upload-dataset-to-the-hub)하거나 [파일들이 있는 로컬 폴더를 준비](https ://huggingface.co/docs/datasets/image_dataset#imagefolder)할 수 있습니다.
@@ -166,7 +166,7 @@ python train_text_to_image_flax.py \
--max_train_steps=15000 \
--learning_rate=1e-05 \
--max_grad_norm=1 \
- --output_dir="sd-pokemon-model"
+ --output_dir="sd-naruto-model"
```
@@ -189,7 +189,7 @@ pipe = StableDiffusionPipeline.from_pretrained(model_path, torch_dtype=torch.flo
pipe.to("cuda")
image = pipe(prompt="yoda").images[0]
-image.save("yoda-pokemon.png")
+image.save("yoda-naruto.png")
```
@@ -203,7 +203,7 @@ from diffusers import FlaxStableDiffusionPipeline
model_path = "path_to_saved_model"
pipe, params = FlaxStableDiffusionPipeline.from_pretrained(model_path, dtype=jax.numpy.bfloat16)
-prompt = "yoda pokemon"
+prompt = "yoda naruto"
prng_seed = jax.random.PRNGKey(0)
num_inference_steps = 50
@@ -218,7 +218,7 @@ prompt_ids = shard(prompt_ids)
images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).images
images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
-image.save("yoda-pokemon.png")
+image.save("yoda-naruto.png")
```
\ No newline at end of file
diff --git a/docs/source/ko/training/unconditional_training.md b/docs/source/ko/training/unconditional_training.md
index d0c200ef2daa..de9ae39a7a76 100644
--- a/docs/source/ko/training/unconditional_training.md
+++ b/docs/source/ko/training/unconditional_training.md
@@ -103,13 +103,13 @@ accelerate launch train_unconditional.py \
-[Pokemon](https://huggingface.co/datasets/huggan/pokemon) 데이터셋을 사용할 경우:
+[Naruto](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) 데이터셋을 사용할 경우:
```bash
accelerate launch train_unconditional.py \
- --dataset_name="huggan/pokemon" \
+ --dataset_name="lambdalabs/naruto-blip-captions" \
--resolution=64 \
- --output_dir="ddpm-ema-pokemon-64" \
+ --output_dir="ddpm-ema-naruto-64" \
--train_batch_size=16 \
--num_epochs=100 \
--gradient_accumulation_steps=1 \
@@ -129,9 +129,9 @@ accelerate launch train_unconditional.py \
```bash
accelerate launch --mixed_precision="fp16" --multi_gpu train_unconditional.py \
- --dataset_name="huggan/pokemon" \
+ --dataset_name="lambdalabs/naruto-blip-captions" \
--resolution=64 --center_crop --random_flip \
- --output_dir="ddpm-ema-pokemon-64" \
+ --output_dir="ddpm-ema-naruto-64" \
--train_batch_size=16 \
--num_epochs=100 \
--gradient_accumulation_steps=1 \
diff --git a/docs/source/zh/stable_diffusion.md b/docs/source/zh/stable_diffusion.md
index 614d9505d7b8..d92cdf7d1163 100644
--- a/docs/source/zh/stable_diffusion.md
+++ b/docs/source/zh/stable_diffusion.md
@@ -51,7 +51,7 @@ prompt = "portrait photo of a old warrior chief"
pipeline = pipeline.to("cuda")
```
-为了确保您可以使用相同的图像并对其进行改进,使用 [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) 方法,然后设置一个随机数种子 以确保其 [复现性](./using-diffusers/reproducibility):
+为了确保您可以使用相同的图像并对其进行改进,使用 [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) 方法,然后设置一个随机数种子 以确保其 [复现性](./using-diffusers/reusing_seeds):
```python
import torch
diff --git a/examples/advanced_diffusion_training/README.md b/examples/advanced_diffusion_training/README.md
index fda73f9ce7a5..a13ae719cfdc 100644
--- a/examples/advanced_diffusion_training/README.md
+++ b/examples/advanced_diffusion_training/README.md
@@ -234,7 +234,7 @@ In ComfyUI we will load a LoRA and a textual embedding at the same time.
SDXL's VAE is known to suffer from numerical instability issues. This is why we also expose a CLI argument namely `--pretrained_vae_model_name_or_path` that lets you specify the location of a better VAE (such as [this one](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix)).
### DoRA training
-The advanced script now supports DoRA training too!
+The advanced script supports DoRA training too!
> Proposed in [DoRA: Weight-Decomposed Low-Rank Adaptation](https://arxiv.org/abs/2402.09353),
**DoRA** is very similar to LoRA, except it decomposes the pre-trained weight into two components, **magnitude** and **direction** and employs LoRA for _directional_ updates to efficiently minimize the number of trainable parameters.
The authors found that by using DoRA, both the learning capacity and training stability of LoRA are enhanced without any additional overhead during inference.
@@ -304,6 +304,147 @@ accelerate launch train_dreambooth_lora_sdxl_advanced.py \
> [!CAUTION]
> Min-SNR gamma is not supported with the EDM-style training yet. When training with the PlaygroundAI model, it's recommended to not pass any "variant".
+### B-LoRA training
+The advanced script now supports B-LoRA training too!
+> Proposed in [Implicit Style-Content Separation using B-LoRA](https://arxiv.org/abs/2403.14572),
+B-LoRA is a method that leverages LoRA to implicitly separate the style and content components of a **single** image.
+It was shown that learning the LoRA weights of two specific blocks (referred to as B-LoRAs)
+achieves style-content separation that cannot be achieved by training each B-LoRA independently.
+Once trained, the two B-LoRAs can be used as independent components to enable various image stylization tasks.
+
+**Usage**
+Enable B-LoRA training by adding this flag:
+```bash
+--use_blora
+```
+You can train a B-LoRA with as little as 1 image and 1000 steps. Try this default configuration as a starting point:
+```bash
+accelerate launch train_dreambooth_b-lora_sdxl.py \
+ --pretrained_model_name_or_path="stabilityai/stable-diffusion-xl-base-1.0" \
+ --instance_data_dir="linoyts/B-LoRA_teddy_bear" \
+ --output_dir="B-LoRA_teddy_bear" \
+ --instance_prompt="a [v18]" \
+ --resolution=1024 \
+ --rank=64 \
+ --train_batch_size=1 \
+ --learning_rate=5e-5 \
+ --lr_scheduler="constant" \
+ --lr_warmup_steps=0 \
+ --max_train_steps=1000 \
+ --checkpointing_steps=2000 \
+ --seed="0" \
+ --gradient_checkpointing \
+ --mixed_precision="fp16"
+```
+**Inference**
+Inference is a bit different:
+1. we need to load *specific* U-net layers (as opposed to a regular LoRA/DoRA)
+2. the trained layers we load change based on our objective (e.g. style/content)
+
+```python
+import torch
+from diffusers import StableDiffusionXLPipeline, AutoencoderKL
+
+# taken & modified from B-LoRA repo - https://github.com/yardenfren1996/B-LoRA/blob/main/blora_utils.py
+def is_belong_to_blocks(key, blocks):
+ try:
+ for g in blocks:
+ if g in key:
+ return True
+ return False
+ except Exception as e:
+ raise type(e)(f'failed to is_belong_to_block, due to: {e}')
+
+def lora_lora_unet_blocks(lora_path, alpha, target_blocks):
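+ # load the LoRA state dict, keep only the entries that belong to the target blocks, and scale them by alpha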
+ state_dict, _ = pipeline.lora_state_dict(lora_path)
+ filtered_state_dict = {k: v * alpha for k, v in state_dict.items() if is_belong_to_blocks(k, target_blocks)}
+ return filtered_state_dict
+
+vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
+pipeline = StableDiffusionXLPipeline.from_pretrained(
+ "stabilityai/stable-diffusion-xl-base-1.0",
+ vae=vae,
+ torch_dtype=torch.float16,
+).to("cuda")
+
+# pick a B-LoRA for content and one for style (you can also set either to None)
+content_B_lora_path = "lora-library/B-LoRA-teddybear"
+style_B_lora_path = "lora-library/B-LoRA-pen_sketch"
+
+
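+# per the B-LoRA paper, up_blocks.0.attentions.0 captures content and up_blocks.0.attentions.1 captures style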
+content_B_LoRA = lora_lora_unet_blocks(content_B_lora_path, alpha=1, target_blocks=["unet.up_blocks.0.attentions.0"])
+style_B_LoRA = lora_lora_unet_blocks(style_B_lora_path, alpha=1.1, target_blocks=["unet.up_blocks.0.attentions.1"])
+combined_lora = {**content_B_LoRA, **style_B_LoRA}
+
+# Load both loras
+pipeline.load_lora_into_unet(combined_lora, None, pipeline.unet)
+
+# generate
+prompt = "a [v18] in [v30] style"
+pipeline(prompt, num_images_per_prompt=4).images
+```
+### LoRA training of Targeted U-net Blocks
+The advanced script now supports a custom choice of U-net blocks to train during DreamBooth LoRA tuning.
+> [!NOTE]
+> This feature is still experimental
+
+> Recently, works like B-LoRA showed the potential advantages of learning the LoRA weights of specific U-net blocks, not only in speed & memory,
+> but also in reducing the amount of data needed, improving style manipulation, and overcoming overfitting issues.
+> In light of this, we're introducing a new feature in the advanced script that makes the learned U-net blocks configurable.
+
+**Usage**
+Configure which U-net blocks learn LoRA weights by adding the `--lora_unet_blocks` flag with a comma-separated string specifying the targeted blocks, e.g.:
+```bash
+--lora_unet_blocks="unet.up_blocks.0.attentions.0,unet.up_blocks.0.attentions.1"
+```
+
+> [!NOTE]
+> If you specify both `--use_blora` and `--lora_unet_blocks`, the values given in `--lora_unet_blocks` will be ignored.
+> When `--use_blora` is enabled, the targeted U-net blocks are automatically set to `unet.up_blocks.0.attentions.0,unet.up_blocks.0.attentions.1`, as discussed in the paper.
+> If you wish to experiment with different blocks, specify `--lora_unet_blocks` only.
+
+**Inference**
+Inference is the same as for B-LoRA, except that the targeted blocks should be modified to match your training configuration.
+```python
+import torch
+from diffusers import StableDiffusionXLPipeline, AutoencoderKL
+
+# taken & modified from B-LoRA repo - https://github.com/yardenfren1996/B-LoRA/blob/main/blora_utils.py
+def is_belong_to_blocks(key, blocks):
+ try:
+ for g in blocks:
+ if g in key:
+ return True
+ return False
+ except Exception as e:
+ raise type(e)(f'failed to is_belong_to_block, due to: {e}')
+
+def lora_lora_unet_blocks(lora_path, alpha, target_blocks):
+ state_dict, _ = pipeline.lora_state_dict(lora_path)
+ filtered_state_dict = {k: v * alpha for k, v in state_dict.items() if is_belong_to_blocks(k, target_blocks)}
+ return filtered_state_dict
+
+vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
+pipeline = StableDiffusionXLPipeline.from_pretrained(
+ "stabilityai/stable-diffusion-xl-base-1.0",
+ vae=vae,
+ torch_dtype=torch.float16,
+).to("cuda")
+
+lora_path = "lora-library/B-LoRA-pen_sketch"
+
+state_dict = lora_lora_unet_blocks(lora_path, alpha=1, target_blocks=["unet.up_blocks.0.attentions.0"])
+
+# Load the trained LoRA layers into the unet
+pipeline.load_lora_into_unet(state_dict, None, pipeline.unet)
+
+# generate
+prompt = "a dog in [v30] style"
+pipeline(prompt, num_images_per_prompt=4).images
+```
+
+
### Tips and Tricks
Check out [these recommended practices](https://huggingface.co/blog/sdxl_lora_advanced_script#additional-good-practices)
diff --git a/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py b/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py
index 21a84b77245a..0699ac17077d 100644
--- a/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py
+++ b/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py
@@ -15,7 +15,6 @@
import argparse
import gc
-import hashlib
import itertools
import json
import logging
@@ -40,6 +39,7 @@
from accelerate.logging import get_logger
from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed
from huggingface_hub import create_repo, hf_hub_download, upload_folder
+from huggingface_hub.utils import insecure_hashlib
from packaging import version
from peft import LoraConfig, set_peft_model_state_dict
from peft.utils import get_peft_model_state_dict
@@ -696,6 +696,23 @@ def parse_args(input_args=None):
"Note: to use DoRA you need to install peft from main, `pip install git+https://github.com/huggingface/peft.git`"
),
)
+ parser.add_argument(
+ "--lora_unet_blocks",
+ type=str,
+ default=None,
+ help=(
+ "the U-net blocks to tune during training. please specify them in a comma separated string, e.g. `unet.up_blocks.0.attentions.0,unet.up_blocks.0.attentions.1` etc."
+ "NOTE: By default (if not specified) - regular LoRA training is performed. "
+ "if --use_blora is enabled, this arg will be ignored, since in B-LoRA training, targeted U-net blocks are `unet.up_blocks.0.attentions.0` and `unet.up_blocks.0.attentions.1`"
+ ),
+ )
+ parser.add_argument(
+ "--use_blora",
+ action="store_true",
+ help=(
+ "Whether to train a B-LoRA as proposed in- Implicit Style-Content Separation using B-LoRA https://arxiv.org/abs/2403.14572. "
+ ),
+ )
parser.add_argument(
"--cache_latents",
action="store_true",
@@ -720,6 +737,11 @@ def parse_args(input_args=None):
"For full LoRA text encoder training check --train_text_encoder, for textual "
"inversion training check `--train_text_encoder_ti`"
)
+ if args.use_blora and args.lora_unet_blocks:
+ warnings.warn(
+ "You specified both `--use_blora` and `--lora_unet_blocks`, for B-LoRA training, target unet blocks are: `unet.up_blocks.0.attentions.0` and `unet.up_blocks.0.attentions.1`. "
+ "If you wish to target different U-net blocks, don't enable `--use_blora`"
+ )
env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
if env_local_rank != -1 and env_local_rank != args.local_rank:
@@ -740,6 +762,40 @@ def parse_args(input_args=None):
return args
+# Taken (and slightly modified) from B-LoRA repo https://github.com/yardenfren1996/B-LoRA/blob/main/blora_utils.py
+def is_belong_to_blocks(key, blocks):
+ try:
+ for g in blocks:
+ if g in key:
+ return True
+ return False
+ except Exception as e:
+ raise type(e)(f"failed to is_belong_to_block, due to: {e}")
+
+
+def get_unet_lora_target_modules(unet, use_blora, target_blocks=None):
+ if use_blora:
+ content_b_lora_blocks = "unet.up_blocks.0.attentions.0"
+ style_b_lora_blocks = "unet.up_blocks.0.attentions.1"
+ target_blocks = [content_b_lora_blocks, style_b_lora_blocks]
+ try:
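+ # strip the leading "unet." prefix so the block names match the keys in unet.attn_processors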
+ blocks = [(".").join(blk.split(".")[1:]) for blk in target_blocks]
+
+ attns = [
+ attn_processor_name.rsplit(".", 1)[0]
+ for attn_processor_name, _ in unet.attn_processors.items()
+ if is_belong_to_blocks(attn_processor_name, blocks)
+ ]
+
+ target_modules = [f"{attn}.{mat}" for mat in ["to_k", "to_q", "to_v", "to_out.0"] for attn in attns]
+ return target_modules
+ except Exception as e:
+ raise type(e)(
+ f"failed to get_target_modules, due to: {e}. "
+ f"Please check the modules specified in --lora_unet_blocks are correct"
+ )
+
+
# Taken from https://github.com/replicate/cog-sdxl/blob/main/dataset_and_utils.py
class TokenEmbeddingsHandler:
def __init__(self, text_encoders, tokenizers):
@@ -946,16 +1002,20 @@ def __init__(
transforms.Normalize([0.5], [0.5]),
]
)
+ # if using B-LoRA with a single image, do not apply transformations
+ single_image = len(self.instance_images) < 2
for image in self.instance_images:
- image = exif_transpose(image)
+ if not single_image:
+ image = exif_transpose(image)
if not image.mode == "RGB":
image = image.convert("RGB")
self.original_sizes.append((image.height, image.width))
image = train_resize(image)
- if args.random_flip and random.random() < 0.5:
+
+ if not single_image and args.random_flip and random.random() < 0.5:
# flip
image = train_flip(image)
- if args.center_crop:
+ if args.center_crop or single_image:
y1 = max(0, int(round((image.height - args.resolution) / 2.0)))
x1 = max(0, int(round((image.width - args.resolution) / 2.0)))
image = train_crop(image)
@@ -1216,7 +1276,7 @@ def main(args):
images = pipeline(example["prompt"]).images
for i, image in enumerate(images):
- hash_image = hashlib.sha1(image.tobytes()).hexdigest()
+ hash_image = insecure_hashlib.sha1(image.tobytes()).hexdigest()
image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
image.save(image_filename)
@@ -1374,12 +1434,24 @@ def main(args):
text_encoder_two.gradient_checkpointing_enable()
# now we will add new LoRA weights to the attention layers
+
+ if args.use_blora:
+ # if using B-LoRA, the targeted blocks to train are automatically set
+ target_modules = get_unet_lora_target_modules(unet, use_blora=True)
+ elif args.lora_unet_blocks:
+ # if training specific unet blocks not in the B-LoRA scheme
+ target_blocks_list = "".join(args.lora_unet_blocks.split()).split(",")
+ logger.info(f"list of unet blocks to train: {target_blocks_list}")
+ target_modules = get_unet_lora_target_modules(unet, use_blora=False, target_blocks=target_blocks_list)
+ else:
+ target_modules = ["to_k", "to_q", "to_v", "to_out.0"]
+
unet_lora_config = LoraConfig(
r=args.rank,
- lora_alpha=args.rank,
use_dora=args.use_dora,
+ lora_alpha=args.rank,
init_lora_weights="gaussian",
- target_modules=["to_k", "to_q", "to_v", "to_out.0"],
+ target_modules=target_modules,
)
unet.add_adapter(unet_lora_config)
@@ -1388,8 +1460,8 @@ def main(args):
if args.train_text_encoder:
text_lora_config = LoraConfig(
r=args.rank,
- lora_alpha=args.rank,
use_dora=args.use_dora,
+ lora_alpha=args.rank,
init_lora_weights="gaussian",
target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
)
@@ -1505,6 +1577,7 @@ def load_model_hook(models, input_dir):
models = [unet_]
if args.train_text_encoder:
models.extend([text_encoder_one_, text_encoder_two_])
+ # only upcast trainable parameters (LoRA) into fp32
cast_training_params(models)
accelerator.register_save_state_pre_hook(save_model_hook)
@@ -1525,6 +1598,8 @@ def load_model_hook(models, input_dir):
models = [unet]
if args.train_text_encoder:
models.extend([text_encoder_one, text_encoder_two])
+
+ # only upcast trainable parameters (LoRA) into fp32
cast_training_params(models, dtype=torch.float32)
unet_lora_parameters = list(filter(lambda p: p.requires_grad, unet.parameters()))
@@ -1780,7 +1855,12 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers):
# We need to initialize the trackers we use, and also store our configuration.
# The trackers initializes automatically on the main process.
if accelerator.is_main_process:
- accelerator.init_trackers("dreambooth-lora-sd-xl", config=vars(args))
+ tracker_name = (
+ "dreambooth-lora-sd-xl"
+ if "playground" not in args.pretrained_model_name_or_path
+ else "dreambooth-lora-playground"
+ )
+ accelerator.init_trackers(tracker_name, config=vars(args))
# Train!
total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
@@ -1833,7 +1913,6 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers):
)
def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
- # TODO: revisit other sampling algorithms
sigmas = noise_scheduler.sigmas.to(device=accelerator.device, dtype=dtype)
schedule_timesteps = noise_scheduler.timesteps.to(accelerator.device)
timesteps = timesteps.to(accelerator.device)
@@ -1852,6 +1931,7 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
# flag used for textual inversion
pivoted = False
for epoch in range(first_epoch, args.num_train_epochs):
+ unet.train()
# if performing any kind of optimization of text_encoder params
if args.train_text_encoder or args.train_text_encoder_ti:
if epoch == num_train_epochs_text_encoder:
@@ -1869,7 +1949,6 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
text_encoder_one.text_model.embeddings.requires_grad_(True)
text_encoder_two.text_model.embeddings.requires_grad_(True)
- unet.train()
for step, batch in enumerate(train_dataloader):
if pivoted:
# stopping optimization of text_encoder params
@@ -1970,7 +2049,8 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
timesteps,
prompt_embeds_input,
added_cond_kwargs=unet_added_conditions,
- ).sample
+ return_dict=False,
+ )[0]
else:
unet_added_conditions = {"time_ids": add_time_ids}
prompt_embeds, pooled_prompt_embeds = encode_prompt(
@@ -1988,7 +2068,8 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
timesteps,
prompt_embeds_input,
added_cond_kwargs=unet_added_conditions,
- ).sample
+ return_dict=False,
+ )[0]
weighting = None
if args.do_edm_style_training:
diff --git a/examples/community/checkpoint_merger.py b/examples/community/checkpoint_merger.py
index 9df5943a86b1..f702bf0cea9b 100644
--- a/examples/community/checkpoint_merger.py
+++ b/examples/community/checkpoint_merger.py
@@ -138,7 +138,6 @@ def merge(self, pretrained_model_name_or_path_list: List[Union[str, os.PathLike]
comparison_result &= self._compare_model_configs(config_dicts[idx - 1], config_dicts[idx])
if not force and comparison_result is False:
raise ValueError("Incompatible checkpoints. Please check model_index.json for the models.")
- print(config_dicts[0], config_dicts[1])
print("Compatible model_index.json files found")
# Step 2: Basic Validation has succeeded. Let's download the models and save them into our local files.
cached_folders = []
diff --git a/examples/community/clip_guided_stable_diffusion_img2img.py b/examples/community/clip_guided_stable_diffusion_img2img.py
index 434d5253679a..c8e0a9094f22 100644
--- a/examples/community/clip_guided_stable_diffusion_img2img.py
+++ b/examples/community/clip_guided_stable_diffusion_img2img.py
@@ -359,9 +359,16 @@ def __call__(
# Preprocess image
image = preprocess(image, width, height)
- latents = self.prepare_latents(
- image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, self.device, generator
- )
+ if latents is None:
+ latents = self.prepare_latents(
+ image,
+ latent_timestep,
+ batch_size,
+ num_images_per_prompt,
+ text_embeddings.dtype,
+ self.device,
+ generator,
+ )
if clip_guidance_scale > 0:
if clip_prompt is not None:
diff --git a/examples/community/latent_consistency_img2img.py b/examples/community/latent_consistency_img2img.py
index 35cd74166c68..3c5ffa845699 100644
--- a/examples/community/latent_consistency_img2img.py
+++ b/examples/community/latent_consistency_img2img.py
@@ -240,14 +240,6 @@ def prepare_latents(
return latents
- if latents is None:
- latents = torch.randn(shape, dtype=dtype).to(device)
- else:
- latents = latents.to(device)
- # scale the initial noise by the standard deviation required by the scheduler
- latents = latents * self.scheduler.init_noise_sigma
- return latents
-
def get_w_embedding(self, w, embedding_dim=512, dtype=torch.float32):
"""
see https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
@@ -335,17 +327,18 @@ def __call__(
# 5. Prepare latent variable
num_channels_latents = self.unet.config.in_channels
- latents = self.prepare_latents(
- image,
- latent_timestep,
- batch_size * num_images_per_prompt,
- num_channels_latents,
- height,
- width,
- prompt_embeds.dtype,
- device,
- latents,
- )
+ if latents is None:
+ latents = self.prepare_latents(
+ image,
+ latent_timestep,
+ batch_size * num_images_per_prompt,
+ num_channels_latents,
+ height,
+ width,
+ prompt_embeds.dtype,
+ device,
+ latents,
+ )
bs = batch_size * num_images_per_prompt
# 6. Get Guidance Scale Embedding
diff --git a/examples/community/pipeline_demofusion_sdxl.py b/examples/community/pipeline_demofusion_sdxl.py
index 93e1463638f0..f46d635dae2b 100644
--- a/examples/community/pipeline_demofusion_sdxl.py
+++ b/examples/community/pipeline_demofusion_sdxl.py
@@ -1304,7 +1304,11 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di
if isinstance(component, torch.nn.Module):
if hasattr(component, "_hf_hook"):
is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload)
- is_sequential_cpu_offload = isinstance(getattr(component, "_hf_hook"), AlignDevicesHook)
+ is_sequential_cpu_offload = (
+ isinstance(getattr(component, "_hf_hook"), AlignDevicesHook)
+ or hasattr(component._hf_hook, "hooks")
+ and isinstance(component._hf_hook.hooks[0], AlignDevicesHook)
+ )
logger.info(
"Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again."
)
diff --git a/examples/community/pipeline_stable_diffusion_pag.py b/examples/community/pipeline_stable_diffusion_pag.py
index 04f38a888460..cdb7bd99cb29 100644
--- a/examples/community/pipeline_stable_diffusion_pag.py
+++ b/examples/community/pipeline_stable_diffusion_pag.py
@@ -1,4 +1,5 @@
-# Implementation of StableDiffusionPAGPipeline
+# Implementation of StableDiffusionPipeline with PAG
+# https://ku-cvlab.github.io/Perturbed-Attention-Guidance
import inspect
from typing import Any, Callable, Dict, List, Optional, Union
@@ -134,8 +135,8 @@ def __call__(
value = attn.to_v(hidden_states_ptb)
- hidden_states_ptb = torch.zeros(value.shape).to(value.get_device())
- # hidden_states_ptb = value
+ # hidden_states_ptb = torch.zeros(value.shape).to(value.get_device())
+ hidden_states_ptb = value
hidden_states_ptb = hidden_states_ptb.to(query.dtype)
@@ -1045,7 +1046,7 @@ def pag_scale(self):
return self._pag_scale
@property
- def do_adversarial_guidance(self):
+ def do_perturbed_attention_guidance(self):
return self._pag_scale > 0
@property
@@ -1056,14 +1057,6 @@ def pag_adaptive_scaling(self):
def do_pag_adaptive_scaling(self):
return self._pag_adaptive_scaling > 0
- @property
- def pag_drop_rate(self):
- return self._pag_drop_rate
-
- @property
- def pag_applied_layers(self):
- return self._pag_applied_layers
-
@property
def pag_applied_layers_index(self):
return self._pag_applied_layers_index
@@ -1080,8 +1073,6 @@ def __call__(
guidance_scale: float = 7.5,
pag_scale: float = 0.0,
pag_adaptive_scaling: float = 0.0,
- pag_drop_rate: float = 0.5,
- pag_applied_layers: List[str] = ["down"], # ['down', 'mid', 'up']
pag_applied_layers_index: List[str] = ["d4"], # ['d4', 'd5', 'm0']
negative_prompt: Optional[Union[str, List[str]]] = None,
num_images_per_prompt: Optional[int] = 1,
@@ -1221,8 +1212,6 @@ def __call__(
self._pag_scale = pag_scale
self._pag_adaptive_scaling = pag_adaptive_scaling
- self._pag_drop_rate = pag_drop_rate
- self._pag_applied_layers = pag_applied_layers
self._pag_applied_layers_index = pag_applied_layers_index
# 2. Define call parameters
@@ -1257,13 +1246,13 @@ def __call__(
# to avoid doing two forward passes
# cfg
- if self.do_classifier_free_guidance and not self.do_adversarial_guidance:
+ if self.do_classifier_free_guidance and not self.do_perturbed_attention_guidance:
prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
# pag
- elif not self.do_classifier_free_guidance and self.do_adversarial_guidance:
+ elif not self.do_classifier_free_guidance and self.do_perturbed_attention_guidance:
prompt_embeds = torch.cat([prompt_embeds, prompt_embeds])
# both
- elif self.do_classifier_free_guidance and self.do_adversarial_guidance:
+ elif self.do_classifier_free_guidance and self.do_perturbed_attention_guidance:
prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds, prompt_embeds])
if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
@@ -1306,7 +1295,7 @@ def __call__(
).to(device=device, dtype=latents.dtype)
# 7. Denoising loop
- if self.do_adversarial_guidance:
+ if self.do_perturbed_attention_guidance:
down_layers = []
mid_layers = []
up_layers = []
@@ -1322,6 +1311,29 @@ def __call__(
else:
raise ValueError(f"Invalid layer type: {layer_type}")
+ # change attention layer in UNet if use PAG
+ if self.do_perturbed_attention_guidance:
+ if self.do_classifier_free_guidance:
+ replace_processor = PAGCFGIdentitySelfAttnProcessor()
+ else:
+ replace_processor = PAGIdentitySelfAttnProcessor()
+
+ drop_layers = self.pag_applied_layers_index
+ for drop_layer in drop_layers:
+ try:
+ if drop_layer[0] == "d":
+ down_layers[int(drop_layer[1])].processor = replace_processor
+ elif drop_layer[0] == "m":
+ mid_layers[int(drop_layer[1])].processor = replace_processor
+ elif drop_layer[0] == "u":
+ up_layers[int(drop_layer[1])].processor = replace_processor
+ else:
+ raise ValueError(f"Invalid layer type: {drop_layer[0]}")
+ except IndexError:
+ raise ValueError(
+ f"Invalid layer index: {drop_layer}. Available layers: {len(down_layers)} down layers, {len(mid_layers)} mid layers, {len(up_layers)} up layers."
+ )
+
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
self._num_timesteps = len(timesteps)
with self.progress_bar(total=num_inference_steps) as progress_bar:
@@ -1330,41 +1342,18 @@ def __call__(
continue
# cfg
- if self.do_classifier_free_guidance and not self.do_adversarial_guidance:
+ if self.do_classifier_free_guidance and not self.do_perturbed_attention_guidance:
latent_model_input = torch.cat([latents] * 2)
# pag
- elif not self.do_classifier_free_guidance and self.do_adversarial_guidance:
+ elif not self.do_classifier_free_guidance and self.do_perturbed_attention_guidance:
latent_model_input = torch.cat([latents] * 2)
# both
- elif self.do_classifier_free_guidance and self.do_adversarial_guidance:
+ elif self.do_classifier_free_guidance and self.do_perturbed_attention_guidance:
latent_model_input = torch.cat([latents] * 3)
# no
else:
latent_model_input = latents
- # change attention layer in UNet if use PAG
- if self.do_adversarial_guidance:
- if self.do_classifier_free_guidance:
- replace_processor = PAGCFGIdentitySelfAttnProcessor()
- else:
- replace_processor = PAGIdentitySelfAttnProcessor()
-
- drop_layers = self.pag_applied_layers_index
- for drop_layer in drop_layers:
- try:
- if drop_layer[0] == "d":
- down_layers[int(drop_layer[1])].processor = replace_processor
- elif drop_layer[0] == "m":
- mid_layers[int(drop_layer[1])].processor = replace_processor
- elif drop_layer[0] == "u":
- up_layers[int(drop_layer[1])].processor = replace_processor
- else:
- raise ValueError(f"Invalid layer type: {drop_layer[0]}")
- except IndexError:
- raise ValueError(
- f"Invalid layer index: {drop_layer}. Available layers: {len(down_layers)} down layers, {len(mid_layers)} mid layers, {len(up_layers)} up layers."
- )
-
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
@@ -1381,14 +1370,14 @@ def __call__(
# perform guidance
# cfg
- if self.do_classifier_free_guidance and not self.do_adversarial_guidance:
+ if self.do_classifier_free_guidance and not self.do_perturbed_attention_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
delta = noise_pred_text - noise_pred_uncond
noise_pred = noise_pred_uncond + self.guidance_scale * delta
# pag
- elif not self.do_classifier_free_guidance and self.do_adversarial_guidance:
+ elif not self.do_classifier_free_guidance and self.do_perturbed_attention_guidance:
noise_pred_original, noise_pred_perturb = noise_pred.chunk(2)
signal_scale = self.pag_scale
@@ -1400,7 +1389,7 @@ def __call__(
noise_pred = noise_pred_original + signal_scale * (noise_pred_original - noise_pred_perturb)
# both
- elif self.do_classifier_free_guidance and self.do_adversarial_guidance:
+ elif self.do_classifier_free_guidance and self.do_perturbed_attention_guidance:
noise_pred_uncond, noise_pred_text, noise_pred_text_perturb = noise_pred.chunk(3)
signal_scale = self.pag_scale
@@ -1458,11 +1447,8 @@ def __call__(
# Offload all models
self.maybe_free_model_hooks()
- if not return_dict:
- return (image, has_nsfw_concept)
-
# change attention layer in UNet if use PAG
- if self.do_adversarial_guidance:
+ if self.do_perturbed_attention_guidance:
drop_layers = self.pag_applied_layers_index
for drop_layer in drop_layers:
try:
@@ -1479,4 +1465,7 @@ def __call__(
f"Invalid layer index: {drop_layer}. Available layers: {len(down_layers)} down layers, {len(mid_layers)} mid layers, {len(up_layers)} up layers."
)
+ if not return_dict:
+ return (image, has_nsfw_concept)
+
return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/examples/community/stable_diffusion_controlnet_img2img.py b/examples/community/stable_diffusion_controlnet_img2img.py
index 5f9083616a84..74674e65f0ef 100644
--- a/examples/community/stable_diffusion_controlnet_img2img.py
+++ b/examples/community/stable_diffusion_controlnet_img2img.py
@@ -802,15 +802,16 @@ def __call__(
latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
# 6. Prepare latent variables
- latents = self.prepare_latents(
- image,
- latent_timestep,
- batch_size,
- num_images_per_prompt,
- prompt_embeds.dtype,
- device,
- generator,
- )
+ if latents is None:
+ latents = self.prepare_latents(
+ image,
+ latent_timestep,
+ batch_size,
+ num_images_per_prompt,
+ prompt_embeds.dtype,
+ device,
+ generator,
+ )
# 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
diff --git a/examples/community/stable_diffusion_controlnet_inpaint_img2img.py b/examples/community/stable_diffusion_controlnet_inpaint_img2img.py
index d056eb112165..14c4e4aa6d4e 100644
--- a/examples/community/stable_diffusion_controlnet_inpaint_img2img.py
+++ b/examples/community/stable_diffusion_controlnet_inpaint_img2img.py
@@ -907,15 +907,16 @@ def __call__(
latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
# 6. Prepare latent variables
- latents = self.prepare_latents(
- image,
- latent_timestep,
- batch_size,
- num_images_per_prompt,
- prompt_embeds.dtype,
- device,
- generator,
- )
+ if latents is None:
+ latents = self.prepare_latents(
+ image,
+ latent_timestep,
+ batch_size,
+ num_images_per_prompt,
+ prompt_embeds.dtype,
+ device,
+ generator,
+ )
mask_image_latents = self.prepare_mask_latents(
mask_image,
diff --git a/examples/consistency_distillation/README_sdxl.md b/examples/consistency_distillation/README_sdxl.md
index d3abaa4ce175..6bd84727cf31 100644
--- a/examples/consistency_distillation/README_sdxl.md
+++ b/examples/consistency_distillation/README_sdxl.md
@@ -115,11 +115,11 @@ accelerate launch train_lcm_distill_lora_sdxl_wds.py \
We provide another version for LCM LoRA SDXL that follows best practices of `peft` and leverages the `datasets` library for quick experimentation. The script doesn't load two UNets unlike `train_lcm_distill_lora_sdxl_wds.py` which reduces the memory requirements quite a bit.
-Below is an example training command that trains an LCM LoRA on the [Pokemons dataset](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions):
+Below is an example training command that trains an LCM LoRA on the [Naruto dataset](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions):
```bash
export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0"
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
export VAE_PATH="madebyollin/sdxl-vae-fp16-fix"
accelerate launch train_lcm_distill_lora_sdxl.py \
diff --git a/examples/consistency_distillation/train_lcm_distill_lora_sdxl.py b/examples/consistency_distillation/train_lcm_distill_lora_sdxl.py
index 9405c238f937..56f83f47b84c 100644
--- a/examples/consistency_distillation/train_lcm_distill_lora_sdxl.py
+++ b/examples/consistency_distillation/train_lcm_distill_lora_sdxl.py
@@ -71,7 +71,7 @@
logger = get_logger(__name__)
DATASET_NAME_MAPPING = {
- "lambdalabs/pokemon-blip-captions": ("image", "text"),
+ "lambdalabs/naruto-blip-captions": ("image", "text"),
}
diff --git a/examples/controlnet/train_controlnet_sdxl.py b/examples/controlnet/train_controlnet_sdxl.py
index 62192521a323..288a1e3fb612 100644
--- a/examples/controlnet/train_controlnet_sdxl.py
+++ b/examples/controlnet/train_controlnet_sdxl.py
@@ -32,7 +32,7 @@
import transformers
from accelerate import Accelerator
from accelerate.logging import get_logger
-from accelerate.utils import ProjectConfiguration, set_seed
+from accelerate.utils import DistributedType, ProjectConfiguration, set_seed
from datasets import load_dataset
from huggingface_hub import create_repo, upload_folder
from packaging import version
@@ -53,7 +53,7 @@
from diffusers.optimization import get_scheduler
from diffusers.utils import check_min_version, is_wandb_available, make_image_grid
from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
-from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.import_utils import is_torch_npu_available, is_xformers_available
from diffusers.utils.torch_utils import is_compiled_module
@@ -64,6 +64,8 @@
check_min_version("0.28.0.dev0")
logger = get_logger(__name__)
+if is_torch_npu_available():
+ torch.npu.config.allow_internal_format = False
def log_validation(vae, unet, controlnet, args, accelerator, weight_dtype, step, is_final_validation=False):
@@ -471,6 +473,9 @@ def parse_args(input_args=None):
parser.add_argument(
"--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
)
+ parser.add_argument(
+ "--enable_npu_flash_attention", action="store_true", help="Whether or not to use npu flash attention."
+ )
parser.add_argument(
"--set_grads_to_none",
action="store_true",
@@ -936,6 +941,13 @@ def load_model_hook(models, input_dir):
text_encoder_two.requires_grad_(False)
controlnet.train()
+ if args.enable_npu_flash_attention:
+ if is_torch_npu_available():
+ logger.info("npu flash attention enabled.")
+ unet.enable_npu_flash_attention()
+ else:
+ raise ValueError("npu flash attention requires torch_npu extensions and is supported only on npu devices.")
+
if args.enable_xformers_memory_efficient_attention:
if is_xformers_available():
import xformers
@@ -1235,7 +1247,8 @@ def compute_embeddings(batch, proportion_empty_prompts, text_encoders, tokenizer
progress_bar.update(1)
global_step += 1
- if accelerator.is_main_process:
+ # DeepSpeed requires saving weights on every device; saving weights only on the main process would cause issues.
+ if accelerator.distributed_type == DistributedType.DEEPSPEED or accelerator.is_main_process:
if global_step % args.checkpointing_steps == 0:
# _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
if args.checkpoints_total_limit is not None:
diff --git a/examples/kandinsky2_2/text_to_image/README.md b/examples/kandinsky2_2/text_to_image/README.md
index 6e5a1835593f..d27ba1a21f0c 100644
--- a/examples/kandinsky2_2/text_to_image/README.md
+++ b/examples/kandinsky2_2/text_to_image/README.md
@@ -57,7 +57,7 @@ To disable wandb logging, remove the `--report_to=="wandb"` and `--validation_pr
```bash
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
accelerate launch --mixed_precision="fp16" train_text_to_image_decoder.py \
--dataset_name=$DATASET_NAME \
@@ -139,7 +139,7 @@ You can fine-tune the Kandinsky prior model with `train_text_to_image_prior.py`
```bash
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
accelerate launch --mixed_precision="fp16" train_text_to_image_prior.py \
--dataset_name=$DATASET_NAME \
@@ -183,7 +183,7 @@ If you want to use a fine-tuned decoder checkpoint along with your fine-tuned pr
for running distributed training with `accelerate`. Here is an example command:
```bash
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
accelerate launch --mixed_precision="fp16" --multi_gpu train_text_to_image_decoder.py \
--dataset_name=$DATASET_NAME \
@@ -227,13 +227,13 @@ on consumer GPUs like Tesla T4, Tesla V100.
### Training
-First, you need to set up your development environment as explained in the [installation](#installing-the-dependencies). Make sure to set the `MODEL_NAME` and `DATASET_NAME` environment variables. Here, we will use [Kandinsky 2.2](https://huggingface.co/kandinsky-community/kandinsky-2-2-decoder) and the [Pokemons dataset](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions).
+First, you need to set up your development environment as explained in the [installation](#installing-the-dependencies). Make sure to set the `MODEL_NAME` and `DATASET_NAME` environment variables. Here, we will use [Kandinsky 2.2](https://huggingface.co/kandinsky-community/kandinsky-2-2-decoder) and the [Naruto dataset](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions).
#### Train decoder
```bash
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
accelerate launch --mixed_precision="fp16" train_text_to_image_decoder_lora.py \
--dataset_name=$DATASET_NAME --caption_column="text" \
@@ -252,7 +252,7 @@ accelerate launch --mixed_precision="fp16" train_text_to_image_decoder_lora.py \
#### Train prior
```bash
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
accelerate launch --mixed_precision="fp16" train_text_to_image_prior_lora.py \
--dataset_name=$DATASET_NAME --caption_column="text" \
diff --git a/examples/kandinsky2_2/text_to_image/train_text_to_image_lora_prior.py b/examples/kandinsky2_2/text_to_image/train_text_to_image_lora_prior.py
index e169cf92beb9..f6f3896aaa12 100644
--- a/examples/kandinsky2_2/text_to_image/train_text_to_image_lora_prior.py
+++ b/examples/kandinsky2_2/text_to_image/train_text_to_image_lora_prior.py
@@ -332,7 +332,7 @@ def parse_args():
DATASET_NAME_MAPPING = {
- "lambdalabs/pokemon-blip-captions": ("image", "text"),
+ "lambdalabs/naruto-blip-captions": ("image", "text"),
}
diff --git a/examples/kandinsky2_2/text_to_image/train_text_to_image_prior.py b/examples/kandinsky2_2/text_to_image/train_text_to_image_prior.py
index bd95aed2939c..54a4d0a397b4 100644
--- a/examples/kandinsky2_2/text_to_image/train_text_to_image_prior.py
+++ b/examples/kandinsky2_2/text_to_image/train_text_to_image_prior.py
@@ -56,7 +56,7 @@
logger = get_logger(__name__, log_level="INFO")
DATASET_NAME_MAPPING = {
- "lambdalabs/pokemon-blip-captions": ("image", "text"),
+ "lambdalabs/naruto-blip-captions": ("image", "text"),
}
diff --git a/examples/research_projects/lora/README.md b/examples/research_projects/lora/README.md
index b5d72403166f..14cd6cd9be56 100644
--- a/examples/research_projects/lora/README.md
+++ b/examples/research_projects/lora/README.md
@@ -19,7 +19,7 @@ on consumer GPUs like Tesla T4, Tesla V100.
### Training
-First, you need to set up your development environment as is explained in the [installation section](#installing-the-dependencies). Make sure to set the `MODEL_NAME` and `DATASET_NAME` environment variables. Here, we will use [Stable Diffusion v1-4](https://hf.co/CompVis/stable-diffusion-v1-4) and the [Pokemons dataset](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions).
+First, you need to set up your development environment as is explained in the [installation section](#installing-the-dependencies). Make sure to set the `MODEL_NAME` and `DATASET_NAME` environment variables. Here, we will use [Stable Diffusion v1-4](https://hf.co/CompVis/stable-diffusion-v1-4) and the [Naruto dataset](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions).
**___Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.___**
@@ -27,7 +27,7 @@ First, you need to set up your development environment as is explained in the [i
```bash
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
```
For this example we want to directly store the trained LoRA embeddings on the Hub, so
diff --git a/examples/research_projects/lora/train_text_to_image_lora.py b/examples/research_projects/lora/train_text_to_image_lora.py
index cf00bf270057..1ebc1422b064 100644
--- a/examples/research_projects/lora/train_text_to_image_lora.py
+++ b/examples/research_projects/lora/train_text_to_image_lora.py
@@ -387,7 +387,7 @@ def parse_args():
DATASET_NAME_MAPPING = {
- "lambdalabs/pokemon-blip-captions": ("image", "text"),
+ "lambdalabs/naruto-blip-captions": ("image", "text"),
}
diff --git a/examples/research_projects/onnxruntime/text_to_image/README.md b/examples/research_projects/onnxruntime/text_to_image/README.md
index 48bce2065444..8b499795746c 100644
--- a/examples/research_projects/onnxruntime/text_to_image/README.md
+++ b/examples/research_projects/onnxruntime/text_to_image/README.md
@@ -55,7 +55,7 @@ The command to train a DDPM UNetCondition model on the Pokemon dataset with onnx
```bash
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
-export dataset_name="lambdalabs/pokemon-blip-captions"
+export dataset_name="lambdalabs/naruto-blip-captions"
accelerate launch --mixed_precision="fp16" train_text_to_image.py \
--pretrained_model_name_or_path=$MODEL_NAME \
--dataset_name=$dataset_name \
diff --git a/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py b/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py
index ee61f033d34d..126a10b4f9e9 100644
--- a/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py
+++ b/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py
@@ -59,7 +59,7 @@
logger = get_logger(__name__, log_level="INFO")
DATASET_NAME_MAPPING = {
- "lambdalabs/pokemon-blip-captions": ("image", "text"),
+ "lambdalabs/naruto-blip-captions": ("image", "text"),
}
diff --git a/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image.py b/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image.py
index 0f4cc6c50b5e..d3bf95305dad 100644
--- a/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image.py
+++ b/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image.py
@@ -61,7 +61,7 @@
logger = get_logger(__name__, log_level="INFO")
DATASET_NAME_MAPPING = {
- "lambdalabs/pokemon-blip-captions": ("image", "text"),
+ "lambdalabs/naruto-blip-captions": ("image", "text"),
}
diff --git a/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_lora.py b/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_lora.py
index f22519b02e2b..a4b4d69bb892 100644
--- a/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_lora.py
+++ b/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_lora.py
@@ -406,7 +406,7 @@ def parse_args():
DATASET_NAME_MAPPING = {
- "lambdalabs/pokemon-blip-captions": ("image", "text"),
+ "lambdalabs/naruto-blip-captions": ("image", "text"),
}
diff --git a/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_lora_sdxl.py b/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_lora_sdxl.py
index e5ff9d39e8ba..d7f2dcaa3442 100644
--- a/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_lora_sdxl.py
+++ b/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_lora_sdxl.py
@@ -468,7 +468,7 @@ def parse_args(input_args=None):
DATASET_NAME_MAPPING = {
- "lambdalabs/pokemon-blip-captions": ("image", "text"),
+ "lambdalabs/naruto-blip-captions": ("image", "text"),
}
diff --git a/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_sdxl.py b/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_sdxl.py
index 1dac573fce4c..a056bcfc8cb1 100644
--- a/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_sdxl.py
+++ b/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_sdxl.py
@@ -60,7 +60,7 @@
DATASET_NAME_MAPPING = {
- "lambdalabs/pokemon-blip-captions": ("image", "text"),
+ "lambdalabs/naruto-blip-captions": ("image", "text"),
}
diff --git a/examples/text_to_image/README.md b/examples/text_to_image/README.md
index f2931d3f347e..9a8410604878 100644
--- a/examples/text_to_image/README.md
+++ b/examples/text_to_image/README.md
@@ -57,7 +57,7 @@ With `gradient_checkpointing` and `mixed_precision` it should be possible to fin
```bash
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
accelerate launch --mixed_precision="fp16" train_text_to_image.py \
--pretrained_model_name_or_path=$MODEL_NAME \
@@ -136,7 +136,7 @@ for running distributed training with `accelerate`. Here is an example command:
```bash
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
accelerate launch --mixed_precision="fp16" --multi_gpu train_text_to_image.py \
--pretrained_model_name_or_path=$MODEL_NAME \
@@ -170,6 +170,11 @@ For our small Pokemons dataset, the effects of Min-SNR weighting strategy might
Also, note that in this example, we either predict `epsilon` (i.e., the noise) or the `v_prediction`. For both of these cases, the formulation of the Min-SNR weighting strategy that we have used holds.
+#### Training with DREAM
+
+We support training epsilon (noise) prediction models using the [DREAM (Diffusion Rectification and Estimation-Adaptive Models) strategy](https://arxiv.org/abs/2312.00210). DREAM claims to increase model fidelity at the cost of one extra gradient-free UNet `forward` pass per training step. You can turn on DREAM training with the `--dream_training` flag. The `--dream_detail_preservation` argument controls the detail preservation variable *p*; it defaults to 1.0, the value suggested in the paper.
+
+
## Training with LoRA
Low-Rank Adaption of Large Language Models was first introduced by Microsoft in [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685) by *Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen*.
@@ -187,7 +192,7 @@ on consumer GPUs like Tesla T4, Tesla V100.
### Training
-First, you need to set up your development environment as is explained in the [installation section](#installing-the-dependencies). Make sure to set the `MODEL_NAME` and `DATASET_NAME` environment variables. Here, we will use [Stable Diffusion v1-4](https://hf.co/CompVis/stable-diffusion-v1-4) and the [Pokemons dataset](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions).
+First, you need to set up your development environment as is explained in the [installation section](#installing-the-dependencies). Make sure to set the `MODEL_NAME` and `DATASET_NAME` environment variables. Here, we will use [Stable Diffusion v1-4](https://hf.co/CompVis/stable-diffusion-v1-4) and the [Naruto dataset](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions).
**___Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.___**
@@ -195,7 +200,7 @@ First, you need to set up your development environment as is explained in the [i
```bash
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
```
For this example we want to directly store the trained LoRA embeddings on the Hub, so
@@ -277,7 +282,7 @@ pip install -U -r requirements_flax.txt
```bash
export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
python train_text_to_image_flax.py \
--pretrained_model_name_or_path=$MODEL_NAME \
diff --git a/examples/text_to_image/README_sdxl.md b/examples/text_to_image/README_sdxl.md
index 349feef5008e..35ea0091c4f3 100644
--- a/examples/text_to_image/README_sdxl.md
+++ b/examples/text_to_image/README_sdxl.md
@@ -52,7 +52,7 @@ Note also that we use PEFT library as backend for LoRA training, make sure to ha
```bash
export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0"
export VAE_NAME="madebyollin/sdxl-vae-fp16-fix"
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
accelerate launch train_text_to_image_sdxl.py \
--pretrained_model_name_or_path=$MODEL_NAME \
@@ -76,7 +76,7 @@ accelerate launch train_text_to_image_sdxl.py \
**Notes**:
-* The `train_text_to_image_sdxl.py` script pre-computes text embeddings and the VAE encodings and keeps them in memory. While for smaller datasets like [`lambdalabs/pokemon-blip-captions`](https://hf.co/datasets/lambdalabs/pokemon-blip-captions), it might not be a problem, it can definitely lead to memory problems when the script is used on a larger dataset. For those purposes, you would want to serialize these pre-computed representations to disk separately and load them during the fine-tuning process. Refer to [this PR](https://github.com/huggingface/diffusers/pull/4505) for a more in-depth discussion.
+* The `train_text_to_image_sdxl.py` script pre-computes text embeddings and the VAE encodings and keeps them in memory. While for smaller datasets like [`lambdalabs/naruto-blip-captions`](https://hf.co/datasets/lambdalabs/naruto-blip-captions), it might not be a problem, it can definitely lead to memory problems when the script is used on a larger dataset. For those purposes, you would want to serialize these pre-computed representations to disk separately and load them during the fine-tuning process. Refer to [this PR](https://github.com/huggingface/diffusers/pull/4505) for a more in-depth discussion.
* The training script is compute-intensive and may not run on a consumer GPU like Tesla T4.
* The training command shown above performs intermediate quality validation in between the training epochs and logs the results to Weights and Biases. `--report_to`, `--validation_prompt`, and `--validation_epochs` are the relevant CLI arguments here.
* SDXL's VAE is known to suffer from numerical instability issues. This is why we also expose a CLI argument namely `--pretrained_vae_model_name_or_path` that lets you specify the location of a better VAE (such as [this one](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix)).
@@ -142,14 +142,14 @@ on consumer GPUs like Tesla T4, Tesla V100.
### Training
-First, you need to set up your development environment as is explained in the [installation section](#installing-the-dependencies). Make sure to set the `MODEL_NAME` and `DATASET_NAME` environment variables and, optionally, the `VAE_NAME` variable. Here, we will use [Stable Diffusion XL 1.0-base](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) and the [Pokemons dataset](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions).
+First, you need to set up your development environment as is explained in the [installation section](#installing-the-dependencies). Make sure to set the `MODEL_NAME` and `DATASET_NAME` environment variables and, optionally, the `VAE_NAME` variable. Here, we will use [Stable Diffusion XL 1.0-base](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) and the [Naruto dataset](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions).
**___Note: It is quite useful to monitor the training progress by regularly generating sample images during training. [Weights and Biases](https://docs.wandb.ai/quickstart) is a nice solution to easily see generating images during training. All you need to do is to run `pip install wandb` before training to automatically log images.___**
```bash
export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0"
export VAE_NAME="madebyollin/sdxl-vae-fp16-fix"
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
```
For this example we want to directly store the trained LoRA embeddings on the Hub, so
@@ -219,7 +219,7 @@ You need to save the mentioned configuration as an `accelerate_config.yaml` file
```shell
export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0"
export VAE_NAME="madebyollin/sdxl-vae-fp16-fix"
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
export ACCELERATE_CONFIG_FILE="your accelerate_config.yaml"
accelerate launch --config_file $ACCELERATE_CONFIG_FILE train_text_to_image_lora_sdxl.py \
diff --git a/examples/text_to_image/train_text_to_image.py b/examples/text_to_image/train_text_to_image.py
index 84f4c6514cfd..13ee0f2cc4c7 100644
--- a/examples/text_to_image/train_text_to_image.py
+++ b/examples/text_to_image/train_text_to_image.py
@@ -45,7 +45,7 @@
import diffusers
from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionPipeline, UNet2DConditionModel
from diffusers.optimization import get_scheduler
-from diffusers.training_utils import EMAModel, compute_snr
+from diffusers.training_utils import EMAModel, compute_dream_and_update_latents, compute_snr
from diffusers.utils import check_min_version, deprecate, is_wandb_available, make_image_grid
from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
from diffusers.utils.import_utils import is_xformers_available
@@ -62,7 +62,7 @@
logger = get_logger(__name__, log_level="INFO")
DATASET_NAME_MAPPING = {
- "lambdalabs/pokemon-blip-captions": ("image", "text"),
+ "lambdalabs/naruto-blip-captions": ("image", "text"),
}
@@ -361,6 +361,20 @@ def parse_args():
help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. "
"More details here: https://arxiv.org/abs/2303.09556.",
)
+ parser.add_argument(
+ "--dream_training",
+ action="store_true",
+ help=(
+            "Use the DREAM training method, which makes training more efficient and accurate at the "
+            "expense of doing an extra forward pass. See: https://arxiv.org/abs/2312.00210"
+ ),
+ )
+ parser.add_argument(
+ "--dream_detail_preservation",
+ type=float,
+ default=1.0,
+ help="Dream detail preservation factor p (should be greater than 0; default=1.0, as suggested in the paper)",
+ )
parser.add_argument(
"--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
)
@@ -948,6 +962,18 @@ def unwrap_model(model):
else:
raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+ if args.dream_training:
+ noisy_latents, target = compute_dream_and_update_latents(
+ unet,
+ noise_scheduler,
+ timesteps,
+ noise,
+ noisy_latents,
+ target,
+ encoder_hidden_states,
+ args.dream_detail_preservation,
+ )
+
# Predict the noise residual and compute loss
model_pred = unet(noisy_latents, timesteps, encoder_hidden_states, return_dict=False)[0]
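The hunk above wires the new `--dream_training` / `--dream_detail_preservation` flags into the loss computation through `diffusers.training_utils.compute_dream_and_update_latents`. As a rough orientation only, the sketch below shows the idea for epsilon-prediction as described in the DREAM paper: one extra gradient-free UNet pass, after which the noisy latents and the regression target are shifted by the detached estimation error, weighted by a detail-preservation exponent `p`. The function name `dream_rectify` and the exact weighting are assumptions; they are not the maintained diffusers implementation.

```python
# Hedged sketch of the DREAM rectification step for epsilon-prediction. NOT the diffusers
# implementation; the weighting is an assumption based on https://arxiv.org/abs/2312.00210.
import torch


def dream_rectify(unet, noisy_latents, timesteps, encoder_hidden_states, noise, target,
                  sqrt_one_minus_alphas_cumprod, p=1.0):
    # Extra forward pass with the current weights, without building a graph.
    with torch.no_grad():
        pred_noise = unet(noisy_latents, timesteps, encoder_hidden_states, return_dict=False)[0]

    # Per-timestep weight; p is the detail-preservation exponent (--dream_detail_preservation).
    sigma = sqrt_one_minus_alphas_cumprod[timesteps].view(-1, 1, 1, 1)
    lam = sigma**p

    # Shift both the input latents and the target by the detached estimation error, so the
    # trainable pass learns to correct the model's own current estimate.
    delta = (noise - pred_noise).detach() * lam
    return noisy_latents + sigma * delta, target + delta
```

In the script itself the call is gated on `args.dream_training`, and the returned pair replaces `noisy_latents` and `target` immediately before the regular prediction and loss.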
diff --git a/examples/text_to_image/train_text_to_image_flax.py b/examples/text_to_image/train_text_to_image_flax.py
index 557923c52e00..c3a08a90b4e5 100644
--- a/examples/text_to_image/train_text_to_image_flax.py
+++ b/examples/text_to_image/train_text_to_image_flax.py
@@ -250,7 +250,7 @@ def parse_args():
dataset_name_mapping = {
- "lambdalabs/pokemon-blip-captions": ("image", "text"),
+ "lambdalabs/naruto-blip-captions": ("image", "text"),
}
diff --git a/examples/text_to_image/train_text_to_image_lora.py b/examples/text_to_image/train_text_to_image_lora.py
index 7164ac909cb2..37b10cfd1bad 100644
--- a/examples/text_to_image/train_text_to_image_lora.py
+++ b/examples/text_to_image/train_text_to_image_lora.py
@@ -387,7 +387,7 @@ def parse_args():
DATASET_NAME_MAPPING = {
- "lambdalabs/pokemon-blip-captions": ("image", "text"),
+ "lambdalabs/naruto-blip-captions": ("image", "text"),
}
diff --git a/examples/text_to_image/train_text_to_image_lora_sdxl.py b/examples/text_to_image/train_text_to_image_lora_sdxl.py
index 0a6a70de2dc7..c9883252d14b 100644
--- a/examples/text_to_image/train_text_to_image_lora_sdxl.py
+++ b/examples/text_to_image/train_text_to_image_lora_sdxl.py
@@ -32,7 +32,7 @@
import transformers
from accelerate import Accelerator
from accelerate.logging import get_logger
-from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed
+from accelerate.utils import DistributedDataParallelKwargs, DistributedType, ProjectConfiguration, set_seed
from datasets import load_dataset
from huggingface_hub import create_repo, upload_folder
from packaging import version
@@ -60,7 +60,7 @@
is_wandb_available,
)
from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
-from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.import_utils import is_torch_npu_available, is_xformers_available
from diffusers.utils.torch_utils import is_compiled_module
@@ -68,6 +68,8 @@
check_min_version("0.28.0.dev0")
logger = get_logger(__name__)
+if is_torch_npu_available():
+ torch.npu.config.allow_internal_format = False
def save_model_card(
@@ -419,6 +421,9 @@ def parse_args(input_args=None):
parser.add_argument(
"--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
)
+ parser.add_argument(
+ "--enable_npu_flash_attention", action="store_true", help="Whether or not to use npu flash attention."
+ )
parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.")
parser.add_argument(
"--rank",
@@ -449,7 +454,7 @@ def parse_args(input_args=None):
DATASET_NAME_MAPPING = {
- "lambdalabs/pokemon-blip-captions": ("image", "text"),
+ "lambdalabs/naruto-blip-captions": ("image", "text"),
}
@@ -623,6 +628,13 @@ def main(args):
text_encoder_one.to(accelerator.device, dtype=weight_dtype)
text_encoder_two.to(accelerator.device, dtype=weight_dtype)
+ if args.enable_npu_flash_attention:
+ if is_torch_npu_available():
+ logger.info("npu flash attention enabled.")
+ unet.enable_npu_flash_attention()
+ else:
+ raise ValueError("npu flash attention requires torch_npu extensions and is supported only on npu devices.")
+
if args.enable_xformers_memory_efficient_attention:
if is_xformers_available():
import xformers
@@ -1149,7 +1161,8 @@ def compute_time_ids(original_size, crops_coords_top_left):
accelerator.log({"train_loss": train_loss}, step=global_step)
train_loss = 0.0
- if accelerator.is_main_process:
+ # DeepSpeed requires saving weights on every device; saving weights only on the main process would cause issues.
+ if accelerator.distributed_type == DistributedType.DEEPSPEED or accelerator.is_main_process:
if global_step % args.checkpointing_steps == 0:
# _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
if args.checkpoints_total_limit is not None:
diff --git a/examples/text_to_image/train_text_to_image_sdxl.py b/examples/text_to_image/train_text_to_image_sdxl.py
index 88adbb995531..90602ad597a9 100644
--- a/examples/text_to_image/train_text_to_image_sdxl.py
+++ b/examples/text_to_image/train_text_to_image_sdxl.py
@@ -61,7 +61,7 @@
DATASET_NAME_MAPPING = {
- "lambdalabs/pokemon-blip-captions": ("image", "text"),
+ "lambdalabs/naruto-blip-captions": ("image", "text"),
}
diff --git a/examples/wuerstchen/text_to_image/README.md b/examples/wuerstchen/text_to_image/README.md
index d655259088e4..7583296e66d1 100644
--- a/examples/wuerstchen/text_to_image/README.md
+++ b/examples/wuerstchen/text_to_image/README.md
@@ -37,7 +37,7 @@ You can fine-tune the Würstchen prior model with the `train_text_to_image_prior
```bash
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
accelerate launch train_text_to_image_prior.py \
--mixed_precision="fp16" \
@@ -72,10 +72,10 @@ In a nutshell, LoRA allows adapting pretrained models by adding pairs of rank-de
### Prior Training
-First, you need to set up your development environment as explained in the [installation](#Running-locally-with-PyTorch) section. Make sure to set the `DATASET_NAME` environment variable. Here, we will use the [Pokemon captions dataset](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions).
+First, you need to set up your development environment as explained in the [installation](#Running-locally-with-PyTorch) section. Make sure to set the `DATASET_NAME` environment variable. Here, we will use the [Naruto captions dataset](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions).
```bash
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
accelerate launch train_text_to_image_lora_prior.py \
--mixed_precision="fp16" \
diff --git a/examples/wuerstchen/text_to_image/train_text_to_image_lora_prior.py b/examples/wuerstchen/text_to_image/train_text_to_image_lora_prior.py
index 76eaf6423960..79f7d8576ff4 100644
--- a/examples/wuerstchen/text_to_image/train_text_to_image_lora_prior.py
+++ b/examples/wuerstchen/text_to_image/train_text_to_image_lora_prior.py
@@ -55,7 +55,7 @@
logger = get_logger(__name__, log_level="INFO")
DATASET_NAME_MAPPING = {
- "lambdalabs/pokemon-blip-captions": ("image", "text"),
+ "lambdalabs/naruto-blip-captions": ("image", "text"),
}
diff --git a/examples/wuerstchen/text_to_image/train_text_to_image_prior.py b/examples/wuerstchen/text_to_image/train_text_to_image_prior.py
index 49cc5d26072d..3e0acfdaf519 100644
--- a/examples/wuerstchen/text_to_image/train_text_to_image_prior.py
+++ b/examples/wuerstchen/text_to_image/train_text_to_image_prior.py
@@ -56,7 +56,7 @@
logger = get_logger(__name__, log_level="INFO")
DATASET_NAME_MAPPING = {
- "lambdalabs/pokemon-blip-captions": ("image", "text"),
+ "lambdalabs/naruto-blip-captions": ("image", "text"),
}
diff --git a/scripts/convert_animatediff_motion_lora_to_diffusers.py b/scripts/convert_animatediff_motion_lora_to_diffusers.py
index 509a7345793c..c680fdc68462 100644
--- a/scripts/convert_animatediff_motion_lora_to_diffusers.py
+++ b/scripts/convert_animatediff_motion_lora_to_diffusers.py
@@ -1,7 +1,7 @@
import argparse
import torch
-from safetensors.torch import save_file
+from safetensors.torch import load_file, save_file
def convert_motion_module(original_state_dict):
@@ -34,7 +34,10 @@ def get_args():
if __name__ == "__main__":
args = get_args()
- state_dict = torch.load(args.ckpt_path, map_location="cpu")
+ if args.ckpt_path.endswith(".safetensors"):
+ state_dict = load_file(args.ckpt_path)
+ else:
+ state_dict = torch.load(args.ckpt_path, map_location="cpu")
if "state_dict" in state_dict.keys():
state_dict = state_dict["state_dict"]
diff --git a/scripts/convert_animatediff_motion_module_to_diffusers.py b/scripts/convert_animatediff_motion_module_to_diffusers.py
index ceb967acd3d6..e188a6a533e8 100644
--- a/scripts/convert_animatediff_motion_module_to_diffusers.py
+++ b/scripts/convert_animatediff_motion_module_to_diffusers.py
@@ -1,6 +1,7 @@
import argparse
import torch
+from safetensors.torch import load_file
from diffusers import MotionAdapter
@@ -30,6 +31,7 @@ def get_args():
parser.add_argument("--output_path", type=str, required=True)
parser.add_argument("--use_motion_mid_block", action="store_true")
parser.add_argument("--motion_max_seq_length", type=int, default=32)
+ parser.add_argument("--block_out_channels", nargs="+", default=[320, 640, 1280, 1280], type=int)
parser.add_argument("--save_fp16", action="store_true")
return parser.parse_args()
@@ -38,17 +40,23 @@ def get_args():
if __name__ == "__main__":
args = get_args()
- state_dict = torch.load(args.ckpt_path, map_location="cpu")
+ if args.ckpt_path.endswith(".safetensors"):
+ state_dict = load_file(args.ckpt_path)
+ else:
+ state_dict = torch.load(args.ckpt_path, map_location="cpu")
+
if "state_dict" in state_dict.keys():
state_dict = state_dict["state_dict"]
conv_state_dict = convert_motion_module(state_dict)
adapter = MotionAdapter(
- use_motion_mid_block=args.use_motion_mid_block, motion_max_seq_length=args.motion_max_seq_length
+ block_out_channels=args.block_out_channels,
+ use_motion_mid_block=args.use_motion_mid_block,
+ motion_max_seq_length=args.motion_max_seq_length,
)
# skip loading position embeddings
adapter.load_state_dict(conv_state_dict, strict=False)
adapter.save_pretrained(args.output_path)
if args.save_fp16:
- adapter.to(torch.float16).save_pretrained(args.output_path, variant="fp16")
+ adapter.to(dtype=torch.float16).save_pretrained(args.output_path, variant="fp16")
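Both AnimateDiff conversion scripts now pick the loader by file extension, and the motion-module script additionally exposes `--block_out_channels` so adapters for UNets with non-default widths can be converted and saved in fp16. A minimal sketch of that flow outside the CLI follows; the checkpoint path and widths are placeholders, and the key remapping done by the script's `convert_motion_module` helper is elided, so this is illustrative rather than a full converter.

```python
# Minimal sketch (placeholder paths/values): load a motion-module checkpoint by extension and
# build a MotionAdapter whose block widths match the target UNet. Key remapping via the script's
# convert_motion_module() is omitted here.
import torch
from safetensors.torch import load_file

from diffusers import MotionAdapter

ckpt_path = "motion_module.safetensors"  # placeholder

if ckpt_path.endswith(".safetensors"):
    state_dict = load_file(ckpt_path)
else:
    state_dict = torch.load(ckpt_path, map_location="cpu")

adapter = MotionAdapter(
    block_out_channels=[320, 640, 1280, 1280],  # override for UNets with other widths
    use_motion_mid_block=True,
    motion_max_seq_length=32,
)
adapter.load_state_dict(state_dict, strict=False)  # strict=False: position embeddings are skipped, as in the script
adapter.to(dtype=torch.float16).save_pretrained("motion-adapter-out", variant="fp16")
```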
diff --git a/scripts/convert_zero123_to_diffusers.py b/scripts/convert_zero123_to_diffusers.py
index 669a4962be3c..b46633fae7ff 100644
--- a/scripts/convert_zero123_to_diffusers.py
+++ b/scripts/convert_zero123_to_diffusers.py
@@ -113,7 +113,7 @@ def create_unet_diffusers_config(original_config, image_size: int, controlnet=Fa
assert "adm_in_channels" in unet_params
projection_class_embeddings_input_dim = unet_params["adm_in_channels"]
else:
- raise NotImplementedError(f"Unknown conditional unet num_classes config: {unet_params["num_classes"]}")
+ raise NotImplementedError(f"Unknown conditional unet num_classes config: {unet_params['num_classes']}")
config = {
"sample_size": image_size // vae_scale_factor,
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index d6aa1e11b6d2..f7fa82157c52 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -216,6 +216,7 @@
"AmusedInpaintPipeline",
"AmusedPipeline",
"AnimateDiffPipeline",
+ "AnimateDiffSDXLPipeline",
"AnimateDiffVideoToVideoPipeline",
"AudioLDM2Pipeline",
"AudioLDM2ProjectionModel",
@@ -595,6 +596,7 @@
AmusedInpaintPipeline,
AmusedPipeline,
AnimateDiffPipeline,
+ AnimateDiffSDXLPipeline,
AnimateDiffVideoToVideoPipeline,
AudioLDM2Pipeline,
AudioLDM2ProjectionModel,
diff --git a/src/diffusers/configuration_utils.py b/src/diffusers/configuration_utils.py
index 8f4a1958975d..7d76687a3d1e 100644
--- a/src/diffusers/configuration_utils.py
+++ b/src/diffusers/configuration_utils.py
@@ -310,9 +310,9 @@ def load_config(
force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
- incompletely downloaded files are deleted.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -341,7 +341,7 @@ def load_config(
"""
cache_dir = kwargs.pop("cache_dir", None)
force_download = kwargs.pop("force_download", False)
- resume_download = kwargs.pop("resume_download", False)
+ resume_download = kwargs.pop("resume_download", None)
proxies = kwargs.pop("proxies", None)
token = kwargs.pop("token", None)
local_files_only = kwargs.pop("local_files_only", False)
diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py
index 4ccb9d77d627..0f4481570829 100644
--- a/src/diffusers/image_processor.py
+++ b/src/diffusers/image_processor.py
@@ -80,7 +80,6 @@ def __init__(
" if you intended to convert the image into RGB format, please set `do_convert_grayscale = False`.",
" if you intended to convert the image into grayscale format, please set `do_convert_rgb = False`",
)
- self.config.do_convert_rgb = False
@staticmethod
def numpy_to_pil(images: np.ndarray) -> List[PIL.Image.Image]:
diff --git a/src/diffusers/loaders/autoencoder.py b/src/diffusers/loaders/autoencoder.py
index b91d27f7d63e..36b022a26ec9 100644
--- a/src/diffusers/loaders/autoencoder.py
+++ b/src/diffusers/loaders/autoencoder.py
@@ -50,9 +50,9 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
cache_dir (`Union[str, os.PathLike]`, *optional*):
Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
is not used.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
- incompletely downloaded files are deleted.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -99,7 +99,7 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
original_config_file = kwargs.pop("original_config_file", None)
config_file = kwargs.pop("config_file", None)
- resume_download = kwargs.pop("resume_download", False)
+ resume_download = kwargs.pop("resume_download", None)
force_download = kwargs.pop("force_download", False)
proxies = kwargs.pop("proxies", None)
token = kwargs.pop("token", None)
diff --git a/src/diffusers/loaders/controlnet.py b/src/diffusers/loaders/controlnet.py
index d323f60aa7ee..53b9802d390e 100644
--- a/src/diffusers/loaders/controlnet.py
+++ b/src/diffusers/loaders/controlnet.py
@@ -50,9 +50,9 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
cache_dir (`Union[str, os.PathLike]`, *optional*):
Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
is not used.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
- incompletely downloaded files are deleted.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -89,7 +89,7 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
"""
original_config_file = kwargs.pop("original_config_file", None)
config_file = kwargs.pop("config_file", None)
- resume_download = kwargs.pop("resume_download", False)
+ resume_download = kwargs.pop("resume_download", None)
force_download = kwargs.pop("force_download", False)
proxies = kwargs.pop("proxies", None)
token = kwargs.pop("token", None)
diff --git a/src/diffusers/loaders/ip_adapter.py b/src/diffusers/loaders/ip_adapter.py
index 28a4334b199c..ef6a53e43196 100644
--- a/src/diffusers/loaders/ip_adapter.py
+++ b/src/diffusers/loaders/ip_adapter.py
@@ -90,9 +90,9 @@ def load_ip_adapter(
force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
- incompletely downloaded files are deleted.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -135,7 +135,7 @@ def load_ip_adapter(
# Load the main state dict first.
cache_dir = kwargs.pop("cache_dir", None)
force_download = kwargs.pop("force_download", False)
- resume_download = kwargs.pop("resume_download", False)
+ resume_download = kwargs.pop("resume_download", None)
proxies = kwargs.pop("proxies", None)
local_files_only = kwargs.pop("local_files_only", None)
token = kwargs.pop("token", None)
diff --git a/src/diffusers/loaders/lora.py b/src/diffusers/loaders/lora.py
index 5d89658830f1..24be4999de27 100644
--- a/src/diffusers/loaders/lora.py
+++ b/src/diffusers/loaders/lora.py
@@ -176,9 +176,9 @@ def lora_state_dict(
force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
- incompletely downloaded files are deleted.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -208,7 +208,7 @@ def lora_state_dict(
# UNet and text encoder or both.
cache_dir = kwargs.pop("cache_dir", None)
force_download = kwargs.pop("force_download", False)
- resume_download = kwargs.pop("resume_download", False)
+ resume_download = kwargs.pop("resume_download", None)
proxies = kwargs.pop("proxies", None)
local_files_only = kwargs.pop("local_files_only", None)
token = kwargs.pop("token", None)
@@ -369,7 +369,11 @@ def _optionally_disable_offloading(cls, _pipeline):
if not is_model_cpu_offload:
is_model_cpu_offload = isinstance(component._hf_hook, CpuOffload)
if not is_sequential_cpu_offload:
- is_sequential_cpu_offload = isinstance(component._hf_hook, AlignDevicesHook)
+ is_sequential_cpu_offload = (
+ isinstance(component._hf_hook, AlignDevicesHook)
+ or hasattr(component._hf_hook, "hooks")
+ and isinstance(component._hf_hook.hooks[0], AlignDevicesHook)
+ )
logger.info(
"Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again."
@@ -1268,9 +1272,10 @@ def set_lora_device(self, adapter_names: List[str], device: Union[torch.device,
unet_module.lora_A[adapter_name].to(device)
unet_module.lora_B[adapter_name].to(device)
# this is a param, not a module, so device placement is not in-place -> re-assign
- unet_module.lora_magnitude_vector[adapter_name] = unet_module.lora_magnitude_vector[
- adapter_name
- ].to(device)
+ if hasattr(unet_module, "lora_magnitude_vector") and unet_module.lora_magnitude_vector is not None:
+ unet_module.lora_magnitude_vector[adapter_name] = unet_module.lora_magnitude_vector[
+ adapter_name
+ ].to(device)
# Handle the text encoder
modules_to_process = []
@@ -1288,9 +1293,13 @@ def set_lora_device(self, adapter_names: List[str], device: Union[torch.device,
text_encoder_module.lora_A[adapter_name].to(device)
text_encoder_module.lora_B[adapter_name].to(device)
# this is a param, not a module, so device placement is not in-place -> re-assign
- text_encoder_module.lora_magnitude_vector[
- adapter_name
- ] = text_encoder_module.lora_magnitude_vector[adapter_name].to(device)
+ if (
+                        hasattr(text_encoder_module, "lora_magnitude_vector")
+ and text_encoder_module.lora_magnitude_vector is not None
+ ):
+ text_encoder_module.lora_magnitude_vector[
+ adapter_name
+ ] = text_encoder_module.lora_magnitude_vector[adapter_name].to(device)
class StableDiffusionXLLoraLoaderMixin(LoraLoaderMixin):
@@ -1397,6 +1406,9 @@ def save_lora_weights(
text_encoder_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`):
State dict of the LoRA layers corresponding to the `text_encoder`. Must explicitly pass the text
encoder LoRA state dict because it comes from 🤗 Transformers.
+ text_encoder_2_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`):
+ State dict of the LoRA layers corresponding to the `text_encoder_2`. Must explicitly pass the text
+ encoder LoRA state dict because it comes from 🤗 Transformers.
is_main_process (`bool`, *optional*, defaults to `True`):
Whether the process calling this is the main process or not. Useful during distributed training and you
need to call this function on all processes. In this case, set `is_main_process=True` only on the main
@@ -1423,8 +1435,10 @@ def pack_weights(layers, prefix):
if unet_lora_layers:
state_dict.update(pack_weights(unet_lora_layers, "unet"))
- if text_encoder_lora_layers and text_encoder_2_lora_layers:
+ if text_encoder_lora_layers:
state_dict.update(pack_weights(text_encoder_lora_layers, "text_encoder"))
+
+ if text_encoder_2_lora_layers:
state_dict.update(pack_weights(text_encoder_2_lora_layers, "text_encoder_2"))
cls.write_lora_layers(
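The `save_lora_weights` change above packs the two text-encoder LoRA dicts independently, so passing only `text_encoder_lora_layers` no longer requires `text_encoder_2_lora_layers` to be present for the text-encoder weights to be written. A hedged sketch of a call exercising that path; the state-dict keys and tensors are dummy placeholders, not real LoRA weights.

```python
# Hedged sketch: after this change, a UNet + single-text-encoder LoRA can be saved without
# text_encoder_2 layers. The tensors/keys below are dummy placeholders for illustration only.
import torch

from diffusers import StableDiffusionXLPipeline

unet_lora_layers = {"dummy.lora_A.weight": torch.zeros(4, 8)}          # placeholder
text_encoder_lora_layers = {"dummy.lora_A.weight": torch.zeros(4, 8)}  # placeholder

StableDiffusionXLPipeline.save_lora_weights(
    save_directory="sdxl-lora-out",
    unet_lora_layers=unet_lora_layers,
    text_encoder_lora_layers=text_encoder_lora_layers,
    # text_encoder_2_lora_layers intentionally omitted; previously the text_encoder layers were
    # only written when text_encoder_2 layers were passed as well.
)
```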
diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index 752ef18c7a0b..d8ff92d0b0ff 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -177,9 +177,9 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
cache_dir (`Union[str, os.PathLike]`, *optional*):
Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
is not used.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
- incompletely downloaded files are deleted.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -244,7 +244,7 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
```
"""
original_config_file = kwargs.pop("original_config_file", None)
- resume_download = kwargs.pop("resume_download", False)
+ resume_download = kwargs.pop("resume_download", None)
force_download = kwargs.pop("force_download", False)
proxies = kwargs.pop("proxies", None)
token = kwargs.pop("token", None)
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 5b01b8da2b1f..c23e594c3976 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -305,7 +305,7 @@ def fetch_ldm_config_and_checkpoint(
pretrained_model_link_or_path,
class_name,
original_config_file=None,
- resume_download=False,
+ resume_download=None,
force_download=False,
proxies=None,
token=None,
diff --git a/src/diffusers/loaders/textual_inversion.py b/src/diffusers/loaders/textual_inversion.py
index c1c224975cb8..a9b9a9aae052 100644
--- a/src/diffusers/loaders/textual_inversion.py
+++ b/src/diffusers/loaders/textual_inversion.py
@@ -38,7 +38,7 @@
def load_textual_inversion_state_dicts(pretrained_model_name_or_paths, **kwargs):
cache_dir = kwargs.pop("cache_dir", None)
force_download = kwargs.pop("force_download", False)
- resume_download = kwargs.pop("resume_download", False)
+ resume_download = kwargs.pop("resume_download", None)
proxies = kwargs.pop("proxies", None)
local_files_only = kwargs.pop("local_files_only", None)
token = kwargs.pop("token", None)
@@ -308,9 +308,9 @@ def load_textual_inversion(
force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
- incompletely downloaded files are deleted.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -423,7 +423,11 @@ def load_textual_inversion(
if isinstance(component, nn.Module):
if hasattr(component, "_hf_hook"):
is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload)
- is_sequential_cpu_offload = isinstance(getattr(component, "_hf_hook"), AlignDevicesHook)
+ is_sequential_cpu_offload = (
+ isinstance(getattr(component, "_hf_hook"), AlignDevicesHook)
+ or hasattr(component._hf_hook, "hooks")
+ and isinstance(component._hf_hook.hooks[0], AlignDevicesHook)
+ )
logger.info(
"Accelerate hooks detected. Since you have called `load_textual_inversion()`, the previous hooks will be first removed. Then the textual inversion parameters will be loaded and the hooks will be applied again."
)
diff --git a/src/diffusers/loaders/unet.py b/src/diffusers/loaders/unet.py
index 294db44ee61d..5d5ed30dc35f 100644
--- a/src/diffusers/loaders/unet.py
+++ b/src/diffusers/loaders/unet.py
@@ -103,9 +103,9 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
- incompletely downloaded files are deleted.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -149,7 +149,7 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
cache_dir = kwargs.pop("cache_dir", None)
force_download = kwargs.pop("force_download", False)
- resume_download = kwargs.pop("resume_download", False)
+ resume_download = kwargs.pop("resume_download", None)
proxies = kwargs.pop("proxies", None)
local_files_only = kwargs.pop("local_files_only", None)
token = kwargs.pop("token", None)
@@ -359,7 +359,11 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
for _, component in _pipeline.components.items():
if isinstance(component, nn.Module) and hasattr(component, "_hf_hook"):
is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload)
- is_sequential_cpu_offload = isinstance(getattr(component, "_hf_hook"), AlignDevicesHook)
+ is_sequential_cpu_offload = (
+ isinstance(getattr(component, "_hf_hook"), AlignDevicesHook)
+ or hasattr(component._hf_hook, "hooks")
+ and isinstance(component._hf_hook.hooks[0], AlignDevicesHook)
+ )
logger.info(
"Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again."
@@ -1086,9 +1090,9 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
cache_dir (`Union[str, os.PathLike]`, *optional*):
Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
is not used.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
- incompletely downloaded files are deleted.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -1110,7 +1114,7 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
raise ValueError("FromOriginalUNetMixin is currently only compatible with StableCascadeUNet")
config = kwargs.pop("config", None)
- resume_download = kwargs.pop("resume_download", False)
+ resume_download = kwargs.pop("resume_download", None)
force_download = kwargs.pop("force_download", False)
proxies = kwargs.pop("proxies", None)
token = kwargs.pop("token", None)
diff --git a/src/diffusers/models/activations.py b/src/diffusers/models/activations.py
index cec83bdded9e..f94b6c8d6d06 100644
--- a/src/diffusers/models/activations.py
+++ b/src/diffusers/models/activations.py
@@ -18,8 +18,12 @@
from torch import nn
from ..utils import deprecate
+from ..utils.import_utils import is_torch_npu_available
+if is_torch_npu_available():
+ import torch_npu
+
ACTIVATION_FUNCTIONS = {
"swish": nn.SiLU(),
"silu": nn.SiLU(),
@@ -98,9 +102,13 @@ def forward(self, hidden_states, *args, **kwargs):
if len(args) > 0 or kwargs.get("scale", None) is not None:
deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
deprecate("scale", "1.0.0", deprecation_message)
-
- hidden_states, gate = self.proj(hidden_states).chunk(2, dim=-1)
- return hidden_states * self.gelu(gate)
+ hidden_states = self.proj(hidden_states)
+ if is_torch_npu_available():
+            # torch_npu.npu_geglu runs faster and saves memory on NPU.
+ return torch_npu.npu_geglu(hidden_states, dim=-1, approximate=1)[0]
+ else:
+ hidden_states, gate = hidden_states.chunk(2, dim=-1)
+ return hidden_states * self.gelu(gate)
class ApproximateGELU(nn.Module):
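The GEGLU change routes the post-projection split-and-gate through `torch_npu.npu_geglu` when `torch_npu` is available. For readers without NPU hardware, the reference sketch below spells out what the non-fused path computes; treating the fused kernel as equivalent (with the tanh GELU approximation implied by `approximate=1`) is an assumption based on this diff.

```python
# Plain-PyTorch reference for the GEGLU gating that the fused torch_npu.npu_geglu call replaces.
import torch
import torch.nn.functional as F


def geglu_reference(projected: torch.Tensor) -> torch.Tensor:
    # `projected` is the Linear output with 2 * dim_out features.
    hidden, gate = projected.chunk(2, dim=-1)
    return hidden * F.gelu(gate, approximate="tanh")


x = torch.randn(2, 16, 8)        # (batch, seq, 2 * dim_out), dim_out = 4
print(geglu_reference(x).shape)  # torch.Size([2, 16, 4])
```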
diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py
index 429807989296..ea1c987e95c6 100644
--- a/src/diffusers/models/attention_processor.py
+++ b/src/diffusers/models/attention_processor.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
+import math
from importlib import import_module
from typing import Callable, List, Optional, Union
@@ -21,13 +22,15 @@
from ..image_processor import IPAdapterMaskProcessor
from ..utils import deprecate, logging
-from ..utils.import_utils import is_xformers_available
+from ..utils.import_utils import is_torch_npu_available, is_xformers_available
from ..utils.torch_utils import maybe_allow_in_graph
from .lora import LoRALinearLayer
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+if is_torch_npu_available():
+ import torch_npu
if is_xformers_available():
import xformers
@@ -209,6 +212,23 @@ def __init__(
)
self.set_processor(processor)
+ def set_use_npu_flash_attention(self, use_npu_flash_attention: bool) -> None:
+ r"""
+        Set whether to use NPU flash attention from `torch_npu` or not.
+        """
+ if use_npu_flash_attention:
+ processor = AttnProcessorNPU()
+ else:
+ # set attention processor
+ # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
+ # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
+ # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
+ processor = (
+ AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor()
+ )
+ self.set_processor(processor)
+
def set_use_memory_efficient_attention_xformers(
self, use_memory_efficient_attention_xformers: bool, attention_op: Optional[Callable] = None
) -> None:
@@ -1207,6 +1227,116 @@ def __call__(
return hidden_states
+class AttnProcessorNPU:
+    r"""
+    Processor for implementing flash attention using torch_npu. torch_npu supports only fp16 and bf16 data types; if
+    fp32 is used, F.scaled_dot_product_attention is used for the computation instead, and the acceleration on NPU is
+    not significant.
+    """
+
+ def __init__(self):
+ if not is_torch_npu_available():
+ raise ImportError("AttnProcessorNPU requires torch_npu extensions and is supported only on npu devices.")
+
+ def __call__(
+ self,
+ attn: Attention,
+ hidden_states: torch.FloatTensor,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ temb: Optional[torch.FloatTensor] = None,
+ *args,
+ **kwargs,
+ ) -> torch.FloatTensor:
+ if len(args) > 0 or kwargs.get("scale", None) is not None:
+ deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
+ deprecate("scale", "1.0.0", deprecation_message)
+
+ residual = hidden_states
+ if attn.spatial_norm is not None:
+ hidden_states = attn.spatial_norm(hidden_states, temb)
+
+ input_ndim = hidden_states.ndim
+
+ if input_ndim == 4:
+ batch_size, channel, height, width = hidden_states.shape
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+ batch_size, sequence_length, _ = (
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+ )
+
+ if attention_mask is not None:
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+ # scaled_dot_product_attention expects attention_mask shape to be
+ # (batch, heads, source_length, target_length)
+ attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+ if attn.group_norm is not None:
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+ query = attn.to_q(hidden_states)
+
+ if encoder_hidden_states is None:
+ encoder_hidden_states = hidden_states
+ elif attn.norm_cross:
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+ key = attn.to_k(encoder_hidden_states)
+ value = attn.to_v(encoder_hidden_states)
+
+ inner_dim = key.shape[-1]
+ head_dim = inner_dim // attn.heads
+
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
+ if query.dtype in (torch.float16, torch.bfloat16):
+ hidden_states = torch_npu.npu_fusion_attention(
+ query,
+ key,
+ value,
+ attn.heads,
+ input_layout="BNSD",
+ pse=None,
+ atten_mask=attention_mask,
+ scale=1.0 / math.sqrt(query.shape[-1]),
+ pre_tockens=65536,
+ next_tockens=65536,
+ keep_prob=1.0,
+ sync=False,
+ inner_precise=0,
+ )[0]
+ else:
+ # TODO: add support for attn.scale when we move to Torch 2.1
+ hidden_states = F.scaled_dot_product_attention(
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+ )
+
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+ hidden_states = hidden_states.to(query.dtype)
+
+ # linear proj
+ hidden_states = attn.to_out[0](hidden_states)
+ # dropout
+ hidden_states = attn.to_out[1](hidden_states)
+
+ if input_ndim == 4:
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+ if attn.residual_connection:
+ hidden_states = hidden_states + residual
+
+ hidden_states = hidden_states / attn.rescale_output_factor
+
+ return hidden_states
+
+
class AttnProcessor2_0:
r"""
Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
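Together, `AttnProcessorNPU` and the `enable_npu_flash_attention()` / `disable_npu_flash_attention()` helpers added to `ModelMixin` let a model opt into the fused NPU kernel in one call. Below is a short sketch of wiring it into a loaded UNet, guarded the same way the training script does; the checkpoint id is only an example.

```python
# Sketch: opt a UNet into the NPU fused attention path added in this diff. The model id is an
# example checkpoint; on non-NPU machines the guard keeps the default processors.
from diffusers import UNet2DConditionModel
from diffusers.utils.import_utils import is_torch_npu_available

unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet")

if is_torch_npu_available():
    unet.enable_npu_flash_attention()  # swaps every attention processor for AttnProcessorNPU
else:
    print("torch_npu not available; keeping the default attention processors.")
```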
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl.py b/src/diffusers/models/autoencoders/autoencoder_kl.py
index b286453de424..0b9b9d4d47e5 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl.py
@@ -65,6 +65,7 @@ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalVAEMixin):
"""
_supports_gradient_checkpointing = True
+ _no_split_modules = ["BasicTransformerBlock", "ResnetBlock2D"]
@register_to_config
def __init__(
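`AutoencoderKL` now declares `_no_split_modules`, which the `_determine_device_map` helper added further down feeds to accelerate so residual and transformer blocks are never split across devices when a string `device_map` is resolved. The sketch below reproduces that resolution with accelerate directly, without assuming anything about the final public `from_pretrained(device_map=...)` behavior; the checkpoint id is only an example.

```python
# Sketch: resolve a device map for the VAE the same way the new helper does, using accelerate
# directly. On a CPU-only machine everything maps to "cpu".
import torch
from accelerate import infer_auto_device_map
from accelerate.utils import get_balanced_memory

from diffusers import AutoencoderKL

vae = AutoencoderKL.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="vae")

no_split = ["BasicTransformerBlock", "ResnetBlock2D"]  # mirrors the new _no_split_modules
max_memory = get_balanced_memory(vae, dtype=torch.float16, no_split_module_classes=no_split)
device_map = infer_auto_device_map(
    vae, max_memory=max_memory, dtype=torch.float16, no_split_module_classes=no_split
)
print(device_map)  # module name -> device; whole blocks stay on one device
```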
diff --git a/src/diffusers/models/modeling_flax_utils.py b/src/diffusers/models/modeling_flax_utils.py
index 1ddcda9005fc..151281070faa 100644
--- a/src/diffusers/models/modeling_flax_utils.py
+++ b/src/diffusers/models/modeling_flax_utils.py
@@ -245,9 +245,9 @@ def from_pretrained(
force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
- incompletely downloaded files are deleted.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -296,7 +296,7 @@ def from_pretrained(
cache_dir = kwargs.pop("cache_dir", None)
force_download = kwargs.pop("force_download", False)
from_pt = kwargs.pop("from_pt", False)
- resume_download = kwargs.pop("resume_download", False)
+ resume_download = kwargs.pop("resume_download", None)
proxies = kwargs.pop("proxies", None)
local_files_only = kwargs.pop("local_files_only", False)
token = kwargs.pop("token", None)
diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py
index c1fdff8ab356..373f5453aa23 100644
--- a/src/diffusers/models/modeling_utils.py
+++ b/src/diffusers/models/modeling_utils.py
@@ -57,7 +57,8 @@
if is_accelerate_available():
import accelerate
- from accelerate.utils import set_module_tensor_to_device
+ from accelerate import infer_auto_device_map
+ from accelerate.utils import get_balanced_memory, get_max_memory, set_module_tensor_to_device
from accelerate.utils.versions import is_torch_version
@@ -99,6 +100,29 @@ def find_tensor_attributes(module: torch.nn.Module) -> List[Tuple[str, Tensor]]:
return first_tuple[1].dtype
+# Adapted from `transformers` (see modeling_utils.py)
+def _determine_device_map(model: "ModelMixin", device_map, max_memory, torch_dtype):
+ if isinstance(device_map, str):
+ no_split_modules = model._get_no_split_modules(device_map)
+ device_map_kwargs = {"no_split_module_classes": no_split_modules}
+
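+ # "auto" and the "balanced*" strategies compute a memory-balanced split across devices;
+ # "sequential" simply fills each device up to its max memory in order.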
+ if device_map != "sequential":
+ max_memory = get_balanced_memory(
+ model,
+ dtype=torch_dtype,
+ low_zero=(device_map == "balanced_low_0"),
+ max_memory=max_memory,
+ **device_map_kwargs,
+ )
+ else:
+ max_memory = get_max_memory(max_memory)
+
+ device_map_kwargs["max_memory"] = max_memory
+ device_map = infer_auto_device_map(model, dtype=torch_dtype, **device_map_kwargs)
+
+ return device_map
+
+
def load_state_dict(checkpoint_file: Union[str, os.PathLike], variant: Optional[str] = None):
"""
Reads a checkpoint file, returning properly formatted errors if they arise.
@@ -201,6 +225,7 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
_automatically_saved_args = ["_diffusers_version", "_class_name", "_name_or_path"]
_supports_gradient_checkpointing = False
_keys_to_ignore_on_load_unexpected = None
+ _no_split_modules = None
def __init__(self):
super().__init__()
@@ -247,6 +272,36 @@ def disable_gradient_checkpointing(self) -> None:
if self._supports_gradient_checkpointing:
self.apply(partial(self._set_gradient_checkpointing, value=False))
+ def set_use_npu_flash_attention(self, valid: bool) -> None:
+ r"""
+ Set whether to use NPU flash attention (from torch_npu) in the model's attention processors.
+ """
+
+ def fn_recursive_set_npu_flash_attention(module: torch.nn.Module):
+ if hasattr(module, "set_use_npu_flash_attention"):
+ module.set_use_npu_flash_attention(valid)
+
+ for child in module.children():
+ fn_recursive_set_npu_flash_attention(child)
+
+ for module in self.children():
+ if isinstance(module, torch.nn.Module):
+ fn_recursive_set_npu_flash_attention(module)
+
+ def enable_npu_flash_attention(self) -> None:
+ r"""
+ Enable NPU flash attention from torch_npu.
+
+ """
+ self.set_use_npu_flash_attention(True)
+
+ def disable_npu_flash_attention(self) -> None:
+ r"""
+ Disable NPU flash attention from torch_npu.
+
+ """
+ self.set_use_npu_flash_attention(False)
+
def set_use_memory_efficient_attention_xformers(
self, valid: bool, attention_op: Optional[Callable] = None
) -> None:
@@ -421,9 +476,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
- incompletely downloaded files are deleted.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -505,7 +560,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False)
force_download = kwargs.pop("force_download", False)
from_flax = kwargs.pop("from_flax", False)
- resume_download = kwargs.pop("resume_download", False)
+ resume_download = kwargs.pop("resume_download", None)
proxies = kwargs.pop("proxies", None)
output_loading_info = kwargs.pop("output_loading_info", False)
local_files_only = kwargs.pop("local_files_only", None)
@@ -560,6 +615,36 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
" dispatching. Please make sure to set `low_cpu_mem_usage=True`."
)
+ # change device_map into a map if we passed an int, a str or a torch.device
+ if isinstance(device_map, torch.device):
+ device_map = {"": device_map}
+ elif isinstance(device_map, str) and device_map not in ["auto", "balanced", "balanced_low_0", "sequential"]:
+ try:
+ device_map = {"": torch.device(device_map)}
+ except RuntimeError:
+ raise ValueError(
+ "When passing device_map as a string, the value needs to be a device name (e.g. cpu, cuda:0) or "
+ f"'auto', 'balanced', 'balanced_low_0', 'sequential' but found {device_map}."
+ )
+ elif isinstance(device_map, int):
+ if device_map < 0:
+ raise ValueError(
+ "You can't pass device_map as a negative int. If you want to put the model on the cpu, pass device_map = 'cpu' "
+ )
+ else:
+ device_map = {"": device_map}
+
+ if device_map is not None:
+ if low_cpu_mem_usage is None:
+ low_cpu_mem_usage = True
+ elif not low_cpu_mem_usage:
+ raise ValueError("Passing along a `device_map` requires `low_cpu_mem_usage=True`")
+
+ if low_cpu_mem_usage:
+ if device_map is not None and not is_torch_version(">=", "1.10"):
+ # The max memory utils require PyTorch >= 1.10 to have torch.cuda.mem_get_info.
+ raise ValueError("`low_cpu_mem_usage` and `device_map` require PyTorch >= 1.10.")
+
# Load config if we don't provide a configuration
config_path = pretrained_model_name_or_path
@@ -582,10 +667,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
token=token,
revision=revision,
subfolder=subfolder,
- device_map=device_map,
- max_memory=max_memory,
- offload_folder=offload_folder,
- offload_state_dict=offload_state_dict,
user_agent=user_agent,
**kwargs,
)
@@ -690,6 +771,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
else: # else let accelerate handle loading and dispatching.
# Load weights and dispatch according to the device_map
# by default the device_map is None and the weights are loaded on the CPU
+ device_map = _determine_device_map(model, device_map, max_memory, torch_dtype)
try:
accelerate.load_checkpoint_and_dispatch(
model,
@@ -881,6 +963,36 @@ def _find_mismatched_keys(
return model, missing_keys, unexpected_keys, mismatched_keys, error_msgs
+ # Adapted from `transformers` modeling_utils.py
+ def _get_no_split_modules(self, device_map: str):
+ """
+ Get the modules of the model that should not be split when using device_map. We iterate through the modules to
+ get the underlying `_no_split_modules`.
+
+ Args:
+ device_map (`str`):
+ The device map value. Options are ["auto", "balanced", "balanced_low_0", "sequential"]
+
+ Returns:
+ `List[str]`: List of modules that should not be split
+ """
+ _no_split_modules = set()
+ modules_to_check = [self]
+ while len(modules_to_check) > 0:
+ module = modules_to_check.pop(-1)
+ # if the module does not appear in _no_split_modules, we also check the children
+ if module.__class__.__name__ not in _no_split_modules:
+ if isinstance(module, ModelMixin):
+ if module._no_split_modules is None:
+ raise ValueError(
+ f"{module.__class__.__name__} does not support `device_map='{device_map}'`. To implement support, the model "
+ "class needs to implement the `_no_split_modules` attribute."
+ )
+ else:
+ _no_split_modules = _no_split_modules | set(module._no_split_modules)
+ modules_to_check += list(module.children())
+ return list(_no_split_modules)
+
@property
def device(self) -> torch.device:
"""
diff --git a/src/diffusers/models/transformers/transformer_2d.py b/src/diffusers/models/transformers/transformer_2d.py
index 768fceb71ae5..6a2695b9e436 100644
--- a/src/diffusers/models/transformers/transformer_2d.py
+++ b/src/diffusers/models/transformers/transformer_2d.py
@@ -72,6 +72,7 @@ class Transformer2DModel(ModelMixin, ConfigMixin):
"""
_supports_gradient_checkpointing = True
+ _no_split_modules = ["BasicTransformerBlock"]
@register_to_config
def __init__(
diff --git a/src/diffusers/models/unets/unet_2d_condition.py b/src/diffusers/models/unets/unet_2d_condition.py
index 34327e1049c5..697730b359ff 100644
--- a/src/diffusers/models/unets/unet_2d_condition.py
+++ b/src/diffusers/models/unets/unet_2d_condition.py
@@ -161,6 +161,7 @@ class conditioning with `class_embed_type` equal to `None`.
"""
_supports_gradient_checkpointing = True
+ _no_split_modules = ["BasicTransformerBlock", "ResnetBlock2D", "CrossAttnUpBlock2D"]
@register_to_config
def __init__(
diff --git a/src/diffusers/models/unets/unet_3d_blocks.py b/src/diffusers/models/unets/unet_3d_blocks.py
index 97c91f61da1c..75827258f64c 100644
--- a/src/diffusers/models/unets/unet_3d_blocks.py
+++ b/src/diffusers/models/unets/unet_3d_blocks.py
@@ -121,6 +121,7 @@ def get_down_block(
raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlockMotion")
return CrossAttnDownBlockMotion(
num_layers=num_layers,
+ transformer_layers_per_block=transformer_layers_per_block,
in_channels=in_channels,
out_channels=out_channels,
temb_channels=temb_channels,
@@ -255,6 +256,7 @@ def get_up_block(
raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlockMotion")
return CrossAttnUpBlockMotion(
num_layers=num_layers,
+ transformer_layers_per_block=transformer_layers_per_block,
in_channels=in_channels,
out_channels=out_channels,
prev_output_channel=prev_output_channel,
diff --git a/src/diffusers/models/unets/unet_motion_model.py b/src/diffusers/models/unets/unet_motion_model.py
index 595b7b03571c..81cc4b1f7ad4 100644
--- a/src/diffusers/models/unets/unet_motion_model.py
+++ b/src/diffusers/models/unets/unet_motion_model.py
@@ -211,6 +211,8 @@ def __init__(
norm_num_groups: int = 32,
norm_eps: float = 1e-5,
cross_attention_dim: int = 1280,
+ transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1,
+ reverse_transformer_layers_per_block: Optional[Tuple[Tuple[int]]] = None,
use_linear_projection: bool = False,
num_attention_heads: Union[int, Tuple[int, ...]] = 8,
motion_max_seq_length: int = 32,
@@ -218,6 +220,9 @@ def __init__(
use_motion_mid_block: int = True,
encoder_hid_dim: Optional[int] = None,
encoder_hid_dim_type: Optional[str] = None,
+ addition_embed_type: Optional[str] = None,
+ addition_time_embed_dim: Optional[int] = None,
+ projection_class_embeddings_input_dim: Optional[int] = None,
time_cond_proj_dim: Optional[int] = None,
):
super().__init__()
@@ -240,6 +245,21 @@ def __init__(
f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
)
+ if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types):
+ raise ValueError(
+ f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
+ )
+
+ if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
+ raise ValueError(
+ f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
+ )
+
+ if isinstance(transformer_layers_per_block, list) and reverse_transformer_layers_per_block is None:
+ for layer_number_per_block in transformer_layers_per_block:
+ if isinstance(layer_number_per_block, list):
+ raise ValueError("Must provide 'reverse_transformer_layers_per_block` if using asymmetrical UNet.")
+
# input
conv_in_kernel = 3
conv_out_kernel = 3
@@ -260,6 +280,10 @@ def __init__(
if encoder_hid_dim_type is None:
self.encoder_hid_proj = None
+ if addition_embed_type == "text_time":
+ self.add_time_proj = Timesteps(addition_time_embed_dim, True, 0)
+ self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
+
# class embedding
self.down_blocks = nn.ModuleList([])
self.up_blocks = nn.ModuleList([])
@@ -267,6 +291,15 @@ def __init__(
if isinstance(num_attention_heads, int):
num_attention_heads = (num_attention_heads,) * len(down_block_types)
+ if isinstance(cross_attention_dim, int):
+ cross_attention_dim = (cross_attention_dim,) * len(down_block_types)
+
+ if isinstance(layers_per_block, int):
+ layers_per_block = [layers_per_block] * len(down_block_types)
+
+ if isinstance(transformer_layers_per_block, int):
+ transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
+
# down
output_channel = block_out_channels[0]
for i, down_block_type in enumerate(down_block_types):
@@ -276,7 +309,7 @@ def __init__(
down_block = get_down_block(
down_block_type,
- num_layers=layers_per_block,
+ num_layers=layers_per_block[i],
in_channels=input_channel,
out_channels=output_channel,
temb_channels=time_embed_dim,
@@ -284,13 +317,14 @@ def __init__(
resnet_eps=norm_eps,
resnet_act_fn=act_fn,
resnet_groups=norm_num_groups,
- cross_attention_dim=cross_attention_dim,
+ cross_attention_dim=cross_attention_dim[i],
num_attention_heads=num_attention_heads[i],
downsample_padding=downsample_padding,
use_linear_projection=use_linear_projection,
dual_cross_attention=False,
temporal_num_attention_heads=motion_num_attention_heads,
temporal_max_seq_length=motion_max_seq_length,
+ transformer_layers_per_block=transformer_layers_per_block[i],
)
self.down_blocks.append(down_block)
@@ -302,13 +336,14 @@ def __init__(
resnet_eps=norm_eps,
resnet_act_fn=act_fn,
output_scale_factor=mid_block_scale_factor,
- cross_attention_dim=cross_attention_dim,
+ cross_attention_dim=cross_attention_dim[-1],
num_attention_heads=num_attention_heads[-1],
resnet_groups=norm_num_groups,
dual_cross_attention=False,
use_linear_projection=use_linear_projection,
temporal_num_attention_heads=motion_num_attention_heads,
temporal_max_seq_length=motion_max_seq_length,
+ transformer_layers_per_block=transformer_layers_per_block[-1],
)
else:
@@ -318,11 +353,12 @@ def __init__(
resnet_eps=norm_eps,
resnet_act_fn=act_fn,
output_scale_factor=mid_block_scale_factor,
- cross_attention_dim=cross_attention_dim,
+ cross_attention_dim=cross_attention_dim[-1],
num_attention_heads=num_attention_heads[-1],
resnet_groups=norm_num_groups,
dual_cross_attention=False,
use_linear_projection=use_linear_projection,
+ transformer_layers_per_block=transformer_layers_per_block[-1],
)
# count how many layers upsample the images
@@ -331,6 +367,9 @@ def __init__(
# up
reversed_block_out_channels = list(reversed(block_out_channels))
reversed_num_attention_heads = list(reversed(num_attention_heads))
+ reversed_layers_per_block = list(reversed(layers_per_block))
+ reversed_cross_attention_dim = list(reversed(cross_attention_dim))
+ reversed_transformer_layers_per_block = list(reversed(transformer_layers_per_block))
output_channel = reversed_block_out_channels[0]
for i, up_block_type in enumerate(up_block_types):
@@ -349,7 +388,7 @@ def __init__(
up_block = get_up_block(
up_block_type,
- num_layers=layers_per_block + 1,
+ num_layers=reversed_layers_per_block[i] + 1,
in_channels=input_channel,
out_channels=output_channel,
prev_output_channel=prev_output_channel,
@@ -358,13 +397,14 @@ def __init__(
resnet_eps=norm_eps,
resnet_act_fn=act_fn,
resnet_groups=norm_num_groups,
- cross_attention_dim=cross_attention_dim,
+ cross_attention_dim=reversed_cross_attention_dim[i],
num_attention_heads=reversed_num_attention_heads[i],
dual_cross_attention=False,
resolution_idx=i,
use_linear_projection=use_linear_projection,
temporal_num_attention_heads=motion_num_attention_heads,
temporal_max_seq_length=motion_max_seq_length,
+ transformer_layers_per_block=reversed_transformer_layers_per_block[i],
)
self.up_blocks.append(up_block)
prev_output_channel = output_channel
@@ -835,6 +875,28 @@ def forward(
t_emb = t_emb.to(dtype=self.dtype)
emb = self.time_embedding(t_emb, timestep_cond)
+ aug_emb = None
+
+ if self.config.addition_embed_type == "text_time":
+ if "text_embeds" not in added_cond_kwargs:
+ raise ValueError(
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
+ )
+
+ text_embeds = added_cond_kwargs.get("text_embeds")
+ if "time_ids" not in added_cond_kwargs:
+ raise ValueError(
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
+ )
+ time_ids = added_cond_kwargs.get("time_ids")
+ time_embeds = self.add_time_proj(time_ids.flatten())
+ time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
+
+ add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
+ add_embeds = add_embeds.to(emb.dtype)
+ aug_emb = self.add_embedding(add_embeds)
+
+ emb = emb if aug_emb is None else emb + aug_emb
emb = emb.repeat_interleave(repeats=num_frames, dim=0)
encoder_hidden_states = encoder_hidden_states.repeat_interleave(repeats=num_frames, dim=0)
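These `UNetMotionModel` additions (per-block `transformer_layers_per_block`, `cross_attention_dim` and `layers_per_block`, plus `text_time` additional embeddings) are what make an SDXL UNet convertible. A rough sketch with illustrative checkpoints:

```py
import torch
from diffusers import UNet2DConditionModel
from diffusers.models import MotionAdapter, UNetMotionModel

sdxl_unet = UNet2DConditionModel.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet", torch_dtype=torch.float16
)
adapter = MotionAdapter.from_pretrained(
    "a-r-r-o-w/animatediff-motion-adapter-sdxl-beta", torch_dtype=torch.float16
)
# from_unet2d copies the SDXL config (per-block transformer depths,
# addition_embed_type="text_time", ...) into the motion UNet.
motion_unet = UNetMotionModel.from_unet2d(sdxl_unet, adapter)
```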
diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py
index 169e44c9e3cd..c2dd7ac0d551 100644
--- a/src/diffusers/pipelines/__init__.py
+++ b/src/diffusers/pipelines/__init__.py
@@ -114,6 +114,7 @@
_import_structure["amused"] = ["AmusedImg2ImgPipeline", "AmusedInpaintPipeline", "AmusedPipeline"]
_import_structure["animatediff"] = [
"AnimateDiffPipeline",
+ "AnimateDiffSDXLPipeline",
"AnimateDiffVideoToVideoPipeline",
]
_import_structure["audioldm"] = ["AudioLDMPipeline"]
@@ -367,7 +368,7 @@
from ..utils.dummy_torch_and_transformers_objects import *
else:
from .amused import AmusedImg2ImgPipeline, AmusedInpaintPipeline, AmusedPipeline
- from .animatediff import AnimateDiffPipeline, AnimateDiffVideoToVideoPipeline
+ from .animatediff import AnimateDiffPipeline, AnimateDiffSDXLPipeline, AnimateDiffVideoToVideoPipeline
from .audioldm import AudioLDMPipeline
from .audioldm2 import (
AudioLDM2Pipeline,
diff --git a/src/diffusers/pipelines/animatediff/__init__.py b/src/diffusers/pipelines/animatediff/__init__.py
index 35b99a76fd21..ae6b67b1924c 100644
--- a/src/diffusers/pipelines/animatediff/__init__.py
+++ b/src/diffusers/pipelines/animatediff/__init__.py
@@ -22,6 +22,7 @@
_dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
else:
_import_structure["pipeline_animatediff"] = ["AnimateDiffPipeline"]
+ _import_structure["pipeline_animatediff_sdxl"] = ["AnimateDiffSDXLPipeline"]
_import_structure["pipeline_animatediff_video2video"] = ["AnimateDiffVideoToVideoPipeline"]
if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
@@ -33,6 +34,7 @@
else:
from .pipeline_animatediff import AnimateDiffPipeline
+ from .pipeline_animatediff_sdxl import AnimateDiffSDXLPipeline
from .pipeline_animatediff_video2video import AnimateDiffVideoToVideoPipeline
from .pipeline_output import AnimateDiffPipelineOutput
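With both the eager and lazy import tables updated, the new pipeline resolves from either path (assuming the top-level `diffusers/__init__.py` export that the docstring example below relies on):

```py
from diffusers import AnimateDiffSDXLPipeline
from diffusers.pipelines.animatediff import AnimateDiffSDXLPipeline as _AnimateDiffSDXLPipeline

assert AnimateDiffSDXLPipeline is _AnimateDiffSDXLPipeline
```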
diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py
new file mode 100644
index 000000000000..f15cd5dbebf7
--- /dev/null
+++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py
@@ -0,0 +1,1284 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from transformers import (
+ CLIPImageProcessor,
+ CLIPTextModel,
+ CLIPTextModelWithProjection,
+ CLIPTokenizer,
+ CLIPVisionModelWithProjection,
+)
+
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import (
+ FromSingleFileMixin,
+ IPAdapterMixin,
+ StableDiffusionXLLoraLoaderMixin,
+ TextualInversionLoaderMixin,
+)
+from ...models import AutoencoderKL, ImageProjection, MotionAdapter, UNet2DConditionModel, UNetMotionModel
+from ...models.attention_processor import (
+ AttnProcessor2_0,
+ FusedAttnProcessor2_0,
+ LoRAAttnProcessor2_0,
+ LoRAXFormersAttnProcessor,
+ XFormersAttnProcessor,
+)
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import (
+ DDIMScheduler,
+ DPMSolverMultistepScheduler,
+ EulerAncestralDiscreteScheduler,
+ EulerDiscreteScheduler,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+)
+from ...utils import (
+ USE_PEFT_BACKEND,
+ logging,
+ replace_example_docstring,
+ scale_lora_layers,
+ unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..free_init_utils import FreeInitMixin
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from .pipeline_output import AnimateDiffPipelineOutput
+
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+ Examples:
+ ```py
+ >>> import torch
+ >>> from diffusers.models import MotionAdapter
+ >>> from diffusers import AnimateDiffSDXLPipeline, DDIMScheduler
+ >>> from diffusers.utils import export_to_gif
+
+ >>> adapter = MotionAdapter.from_pretrained(
+ ... "a-r-r-o-w/animatediff-motion-adapter-sdxl-beta", torch_dtype=torch.float16
+ ... )
+
+ >>> model_id = "stabilityai/stable-diffusion-xl-base-1.0"
+ >>> scheduler = DDIMScheduler.from_pretrained(
+ ... model_id,
+ ... subfolder="scheduler",
+ ... clip_sample=False,
+ ... timestep_spacing="linspace",
+ ... beta_schedule="linear",
+ ... steps_offset=1,
+ ... )
+ >>> pipe = AnimateDiffSDXLPipeline.from_pretrained(
+ ... model_id,
+ ... motion_adapter=adapter,
+ ... scheduler=scheduler,
+ ... torch_dtype=torch.float16,
+ ... variant="fp16",
+ ... ).to("cuda")
+
+ >>> # enable memory savings
+ >>> pipe.enable_vae_slicing()
+ >>> pipe.enable_vae_tiling()
+
+ >>> output = pipe(
+ ... prompt="a panda surfing in the ocean, realistic, high quality",
+ ... negative_prompt="low quality, worst quality",
+ ... num_inference_steps=20,
+ ... guidance_scale=8,
+ ... width=1024,
+ ... height=1024,
+ ... num_frames=16,
+ ... )
+
+ >>> frames = output.frames[0]
+ >>> export_to_gif(frames, "animation.gif")
+ ```
+"""
+
+
+# Copied from diffusers.pipelines.animatediff.pipeline_animatediff.tensor2vid
+def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type: str = "np"):
+ batch_size, channels, num_frames, height, width = video.shape
+ outputs = []
+ for batch_idx in range(batch_size):
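+ # video[batch_idx] is (channels, frames, height, width); put frames first so each frame is post-processed as an image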
+ batch_vid = video[batch_idx].permute(1, 0, 2, 3)
+ batch_output = processor.postprocess(batch_vid, output_type)
+
+ outputs.append(batch_output)
+
+ if output_type == "np":
+ outputs = np.stack(outputs)
+
+ elif output_type == "pt":
+ outputs = torch.stack(outputs)
+
+ elif not output_type == "pil":
+ raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']")
+
+ return outputs
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+ """
+ Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+ Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+ """
+ std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+ std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+ # rescale the results from guidance (fixes overexposure)
+ noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+ # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+ noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+ return noise_cfg
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+ scheduler,
+ num_inference_steps: Optional[int] = None,
+ device: Optional[Union[str, torch.device]] = None,
+ timesteps: Optional[List[int]] = None,
+ **kwargs,
+):
+ """
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+ Args:
+ scheduler (`SchedulerMixin`):
+ The scheduler to get timesteps from.
+ num_inference_steps (`int`):
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+ must be `None`.
+ device (`str` or `torch.device`, *optional*):
+ The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
+ timesteps (`List[int]`, *optional*):
+ Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
+ timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
+ must be `None`.
+
+ Returns:
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+ second element is the number of inference steps.
+ """
+ if timesteps is not None:
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+ if not accepts_timesteps:
+ raise ValueError(
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+ f" timestep schedules. Please check whether you are using the correct scheduler."
+ )
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+ timesteps = scheduler.timesteps
+ num_inference_steps = len(timesteps)
+ else:
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+ timesteps = scheduler.timesteps
+ return timesteps, num_inference_steps
+
+
+class AnimateDiffSDXLPipeline(
+ DiffusionPipeline,
+ StableDiffusionMixin,
+ FromSingleFileMixin,
+ StableDiffusionXLLoraLoaderMixin,
+ TextualInversionLoaderMixin,
+ IPAdapterMixin,
+ FreeInitMixin,
+):
+ r"""
+ Pipeline for text-to-video generation using Stable Diffusion XL.
+
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+ The pipeline also inherits the following loading methods:
+ - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+ - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+ - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+ - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+ - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+ Args:
+ vae ([`AutoencoderKL`]):
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+ text_encoder ([`CLIPTextModel`]):
+ Frozen text-encoder. Stable Diffusion XL uses the text portion of
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+ text_encoder_2 ([`CLIPTextModelWithProjection`]):
+ Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+ specifically the
+ [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
+ variant.
+ tokenizer (`CLIPTokenizer`):
+ Tokenizer of class
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+ tokenizer_2 (`CLIPTokenizer`):
+ Second Tokenizer of class
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+ unet ([`UNet2DConditionModel`]):
+ Conditional U-Net architecture to denoise the encoded image latents.
+ scheduler ([`SchedulerMixin`]):
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+ force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `True`):
+ Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
+ `stabilityai/stable-diffusion-xl-base-1.0`.
+ """
+
+ model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->unet->vae"
+ _optional_components = [
+ "tokenizer",
+ "tokenizer_2",
+ "text_encoder",
+ "text_encoder_2",
+ "image_encoder",
+ "feature_extractor",
+ ]
+ _callback_tensor_inputs = [
+ "latents",
+ "prompt_embeds",
+ "negative_prompt_embeds",
+ "add_text_embeds",
+ "add_time_ids",
+ "negative_pooled_prompt_embeds",
+ "negative_add_time_ids",
+ ]
+
+ def __init__(
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ text_encoder_2: CLIPTextModelWithProjection,
+ tokenizer: CLIPTokenizer,
+ tokenizer_2: CLIPTokenizer,
+ unet: Union[UNet2DConditionModel, UNetMotionModel],
+ motion_adapter: MotionAdapter,
+ scheduler: Union[
+ DDIMScheduler,
+ PNDMScheduler,
+ LMSDiscreteScheduler,
+ EulerDiscreteScheduler,
+ EulerAncestralDiscreteScheduler,
+ DPMSolverMultistepScheduler,
+ ],
+ image_encoder: CLIPVisionModelWithProjection = None,
+ feature_extractor: CLIPImageProcessor = None,
+ force_zeros_for_empty_prompt: bool = True,
+ ):
+ super().__init__()
+
+ if isinstance(unet, UNet2DConditionModel):
+ unet = UNetMotionModel.from_unet2d(unet, motion_adapter)
+
+ self.register_modules(
+ vae=vae,
+ text_encoder=text_encoder,
+ text_encoder_2=text_encoder_2,
+ tokenizer=tokenizer,
+ tokenizer_2=tokenizer_2,
+ unet=unet,
+ motion_adapter=motion_adapter,
+ scheduler=scheduler,
+ image_encoder=image_encoder,
+ feature_extractor=feature_extractor,
+ )
+ self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+ self.default_sample_size = self.unet.config.sample_size
+
+ # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt with num_images_per_prompt->num_videos_per_prompt
+ def encode_prompt(
+ self,
+ prompt: str,
+ prompt_2: Optional[str] = None,
+ device: Optional[torch.device] = None,
+ num_videos_per_prompt: int = 1,
+ do_classifier_free_guidance: bool = True,
+ negative_prompt: Optional[str] = None,
+ negative_prompt_2: Optional[str] = None,
+ prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+ lora_scale: Optional[float] = None,
+ clip_skip: Optional[int] = None,
+ ):
+ r"""
+ Encodes the prompt into text encoder hidden states.
+
+ Args:
+ prompt (`str` or `List[str]`, *optional*):
+ prompt to be encoded
+ prompt_2 (`str` or `List[str]`, *optional*):
+ The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+ used in both text-encoders
+ device (`torch.device`):
+ torch device
+ num_videos_per_prompt (`int`):
+ number of images that should be generated per prompt
+ do_classifier_free_guidance (`bool`):
+ whether to use classifier free guidance or not
+ negative_prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+ less than `1`).
+ negative_prompt_2 (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+ `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+ prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+ provided, text embeddings will be generated from `prompt` input argument.
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+ argument.
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
+ negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+ input argument.
+ lora_scale (`float`, *optional*):
+ A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+ clip_skip (`int`, *optional*):
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+ the output of the pre-final layer will be used for computing the prompt embeddings.
+ """
+ device = device or self._execution_device
+
+ # set lora scale so that monkey patched LoRA
+ # function of text encoder can correctly access it
+ if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
+ self._lora_scale = lora_scale
+
+ # dynamically adjust the LoRA scale
+ if self.text_encoder is not None:
+ if not USE_PEFT_BACKEND:
+ adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+ else:
+ scale_lora_layers(self.text_encoder, lora_scale)
+
+ if self.text_encoder_2 is not None:
+ if not USE_PEFT_BACKEND:
+ adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
+ else:
+ scale_lora_layers(self.text_encoder_2, lora_scale)
+
+ prompt = [prompt] if isinstance(prompt, str) else prompt
+
+ if prompt is not None:
+ batch_size = len(prompt)
+ else:
+ batch_size = prompt_embeds.shape[0]
+
+ # Define tokenizers and text encoders
+ tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
+ text_encoders = (
+ [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
+ )
+
+ if prompt_embeds is None:
+ prompt_2 = prompt_2 or prompt
+ prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+ # textual inversion: process multi-vector tokens if necessary
+ prompt_embeds_list = []
+ prompts = [prompt, prompt_2]
+ for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
+ if isinstance(self, TextualInversionLoaderMixin):
+ prompt = self.maybe_convert_prompt(prompt, tokenizer)
+
+ text_inputs = tokenizer(
+ prompt,
+ padding="max_length",
+ max_length=tokenizer.model_max_length,
+ truncation=True,
+ return_tensors="pt",
+ )
+
+ text_input_ids = text_inputs.input_ids
+ untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+ text_input_ids, untruncated_ids
+ ):
+ removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
+ logger.warning(
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
+ f" {tokenizer.model_max_length} tokens: {removed_text}"
+ )
+
+ prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+
+ # We are always interested only in the pooled output of the final text encoder
+ pooled_prompt_embeds = prompt_embeds[0]
+ if clip_skip is None:
+ prompt_embeds = prompt_embeds.hidden_states[-2]
+ else:
+ # "2" because SDXL always indexes from the penultimate layer.
+ prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
+
+ prompt_embeds_list.append(prompt_embeds)
+
+ prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+
+ # get unconditional embeddings for classifier free guidance
+ zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
+ if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
+ negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+ negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
+ elif do_classifier_free_guidance and negative_prompt_embeds is None:
+ negative_prompt = negative_prompt or ""
+ negative_prompt_2 = negative_prompt_2 or negative_prompt
+
+ # normalize str to list
+ negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+ negative_prompt_2 = (
+ batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
+ )
+
+ uncond_tokens: List[str]
+ if prompt is not None and type(prompt) is not type(negative_prompt):
+ raise TypeError(
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+ f" {type(prompt)}."
+ )
+ elif batch_size != len(negative_prompt):
+ raise ValueError(
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+ " the batch size of `prompt`."
+ )
+ else:
+ uncond_tokens = [negative_prompt, negative_prompt_2]
+
+ negative_prompt_embeds_list = []
+ for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
+ if isinstance(self, TextualInversionLoaderMixin):
+ negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)
+
+ max_length = prompt_embeds.shape[1]
+ uncond_input = tokenizer(
+ negative_prompt,
+ padding="max_length",
+ max_length=max_length,
+ truncation=True,
+ return_tensors="pt",
+ )
+
+ negative_prompt_embeds = text_encoder(
+ uncond_input.input_ids.to(device),
+ output_hidden_states=True,
+ )
+ # We are always interested only in the pooled output of the final text encoder
+ negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+ negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
+
+ negative_prompt_embeds_list.append(negative_prompt_embeds)
+
+ negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
+
+ if self.text_encoder_2 is not None:
+ prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+ else:
+ prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+ bs_embed, seq_len, _ = prompt_embeds.shape
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
+ prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
+ prompt_embeds = prompt_embeds.view(bs_embed * num_videos_per_prompt, seq_len, -1)
+
+ if do_classifier_free_guidance:
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+ seq_len = negative_prompt_embeds.shape[1]
+
+ if self.text_encoder_2 is not None:
+ negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+ else:
+ negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+ negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_videos_per_prompt, 1)
+ negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
+
+ pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_videos_per_prompt).view(
+ bs_embed * num_videos_per_prompt, -1
+ )
+ if do_classifier_free_guidance:
+ negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_videos_per_prompt).view(
+ bs_embed * num_videos_per_prompt, -1
+ )
+
+ if self.text_encoder is not None:
+ if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+ # Retrieve the original scale by scaling back the LoRA layers
+ unscale_lora_layers(self.text_encoder, lora_scale)
+
+ if self.text_encoder_2 is not None:
+ if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+ # Retrieve the original scale by scaling back the LoRA layers
+ unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+ return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+ def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+ dtype = next(self.image_encoder.parameters()).dtype
+
+ if not isinstance(image, torch.Tensor):
+ image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+ image = image.to(device=device, dtype=dtype)
+ if output_hidden_states:
+ image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+ image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+ uncond_image_enc_hidden_states = self.image_encoder(
+ torch.zeros_like(image), output_hidden_states=True
+ ).hidden_states[-2]
+ uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+ num_images_per_prompt, dim=0
+ )
+ return image_enc_hidden_states, uncond_image_enc_hidden_states
+ else:
+ image_embeds = self.image_encoder(image).image_embeds
+ image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+ uncond_image_embeds = torch.zeros_like(image_embeds)
+
+ return image_embeds, uncond_image_embeds
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+ def prepare_ip_adapter_image_embeds(
+ self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+ ):
+ if ip_adapter_image_embeds is None:
+ if not isinstance(ip_adapter_image, list):
+ ip_adapter_image = [ip_adapter_image]
+
+ if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+ raise ValueError(
+ f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+ )
+
+ image_embeds = []
+ for single_ip_adapter_image, image_proj_layer in zip(
+ ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+ ):
+ output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+ single_image_embeds, single_negative_image_embeds = self.encode_image(
+ single_ip_adapter_image, device, 1, output_hidden_state
+ )
+ single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
+ single_negative_image_embeds = torch.stack(
+ [single_negative_image_embeds] * num_images_per_prompt, dim=0
+ )
+
+ if do_classifier_free_guidance:
+ single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
+ single_image_embeds = single_image_embeds.to(device)
+
+ image_embeds.append(single_image_embeds)
+ else:
+ repeat_dims = [1]
+ image_embeds = []
+ for single_image_embeds in ip_adapter_image_embeds:
+ if do_classifier_free_guidance:
+ single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+ single_image_embeds = single_image_embeds.repeat(
+ num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
+ )
+ single_negative_image_embeds = single_negative_image_embeds.repeat(
+ num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
+ )
+ single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
+ else:
+ single_image_embeds = single_image_embeds.repeat(
+ num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
+ )
+ image_embeds.append(single_image_embeds)
+
+ return image_embeds
+
+ # Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents
+ def decode_latents(self, latents):
+ latents = 1 / self.vae.config.scaling_factor * latents
+
+ batch_size, channels, num_frames, height, width = latents.shape
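+ # fold frames into the batch: (b, c, f, h, w) -> (b, f, c, h, w) -> (b*f, c, h, w) so the image VAE decodes frame by frame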
+ latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)
+
+ image = self.vae.decode(latents).sample
+ video = image[None, :].reshape((batch_size, num_frames, -1) + image.shape[2:]).permute(0, 2, 1, 3, 4)
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+ video = video.float()
+ return video
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+ def prepare_extra_step_kwargs(self, generator, eta):
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+ # and should be between [0, 1]
+
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+ extra_step_kwargs = {}
+ if accepts_eta:
+ extra_step_kwargs["eta"] = eta
+
+ # check if the scheduler accepts generator
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+ if accepts_generator:
+ extra_step_kwargs["generator"] = generator
+ return extra_step_kwargs
+
+ def check_inputs(
+ self,
+ prompt,
+ prompt_2,
+ height,
+ width,
+ negative_prompt=None,
+ negative_prompt_2=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ pooled_prompt_embeds=None,
+ negative_pooled_prompt_embeds=None,
+ callback_on_step_end_tensor_inputs=None,
+ ):
+ if height % 8 != 0 or width % 8 != 0:
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+ if callback_on_step_end_tensor_inputs is not None and not all(
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+ ):
+ raise ValueError(
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+ )
+
+ if prompt is not None and prompt_embeds is not None:
+ raise ValueError(
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+ " only forward one of the two."
+ )
+ elif prompt_2 is not None and prompt_embeds is not None:
+ raise ValueError(
+ f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+ " only forward one of the two."
+ )
+ elif prompt is None and prompt_embeds is None:
+ raise ValueError(
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+ )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+ elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+ raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+
+ if negative_prompt is not None and negative_prompt_embeds is not None:
+ raise ValueError(
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+ )
+ elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+ raise ValueError(
+ f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+ )
+
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
+ raise ValueError(
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+ f" {negative_prompt_embeds.shape}."
+ )
+
+ if prompt_embeds is not None and pooled_prompt_embeds is None:
+ raise ValueError(
+ "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+ )
+
+ if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+ raise ValueError(
+ "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+ )
+
+ # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.prepare_latents
+ def prepare_latents(
+ self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None
+ ):
+ shape = (
+ batch_size,
+ num_channels_latents,
+ num_frames,
+ height // self.vae_scale_factor,
+ width // self.vae_scale_factor,
+ )
+ if isinstance(generator, list) and len(generator) != batch_size:
+ raise ValueError(
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+ )
+
+ if latents is None:
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+ else:
+ latents = latents.to(device)
+
+ # scale the initial noise by the standard deviation required by the scheduler
+ latents = latents * self.scheduler.init_noise_sigma
+ return latents
+
+ def _get_add_time_ids(
+ self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None
+ ):
+ add_time_ids = list(original_size + crops_coords_top_left + target_size)
+
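+ # e.g. for SDXL-base: 6 time ids * 256 (addition_time_embed_dim) + 1280 (text_encoder_2 projection dim) = 2816,
+ # which must equal add_embedding.linear_1.in_features (projection_class_embeddings_input_dim).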
+ passed_add_embed_dim = (
+ self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
+ )
+ expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
+
+ if expected_add_embed_dim != passed_add_embed_dim:
+ raise ValueError(
+ f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
+ )
+
+ add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
+ return add_time_ids
+
+ def upcast_vae(self):
+ dtype = self.vae.dtype
+ self.vae.to(dtype=torch.float32)
+ use_torch_2_0_or_xformers = isinstance(
+ self.vae.decoder.mid_block.attentions[0].processor,
+ (
+ AttnProcessor2_0,
+ XFormersAttnProcessor,
+ LoRAXFormersAttnProcessor,
+ LoRAAttnProcessor2_0,
+ FusedAttnProcessor2_0,
+ ),
+ )
+ # if xformers or torch_2_0 is used attention block does not need
+ # to be in float32 which can save lots of memory
+ if use_torch_2_0_or_xformers:
+ self.vae.post_quant_conv.to(dtype)
+ self.vae.decoder.conv_in.to(dtype)
+ self.vae.decoder.mid_block.to(dtype)
+
+ # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+ def get_guidance_scale_embedding(
+ self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+ ) -> torch.FloatTensor:
+ """
+ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+ Args:
+ w (`torch.Tensor`):
+ Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
+ embedding_dim (`int`, *optional*, defaults to 512):
+ Dimension of the embeddings to generate.
+ dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+ Data type of the generated embeddings.
+
+ Returns:
+ `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
+ """
+ assert len(w.shape) == 1
+ w = w * 1000.0
+
+ half_dim = embedding_dim // 2
+ emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+ emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+ emb = w.to(dtype)[:, None] * emb[None, :]
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+ if embedding_dim % 2 == 1: # zero pad
+ emb = torch.nn.functional.pad(emb, (0, 1))
+ assert emb.shape == (w.shape[0], embedding_dim)
+ return emb
+
+ @property
+ def guidance_scale(self):
+ return self._guidance_scale
+
+ @property
+ def guidance_rescale(self):
+ return self._guidance_rescale
+
+ @property
+ def clip_skip(self):
+ return self._clip_skip
+
+ # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+ # corresponds to doing no classifier free guidance.
+ @property
+ def do_classifier_free_guidance(self):
+ return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+ @property
+ def cross_attention_kwargs(self):
+ return self._cross_attention_kwargs
+
+ @property
+ def denoising_end(self):
+ return self._denoising_end
+
+ @property
+ def num_timesteps(self):
+ return self._num_timesteps
+
+ @property
+ def interrupt(self):
+ return self._interrupt
+
+ @torch.no_grad()
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
+ def __call__(
+ self,
+ prompt: Union[str, List[str]] = None,
+ prompt_2: Optional[Union[str, List[str]]] = None,
+ num_frames: int = 16,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 50,
+ timesteps: List[int] = None,
+ denoising_end: Optional[float] = None,
+ guidance_scale: float = 5.0,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ negative_prompt_2: Optional[Union[str, List[str]]] = None,
+ num_videos_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+ latents: Optional[torch.FloatTensor] = None,
+ prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+ ip_adapter_image: Optional[PipelineImageInput] = None,
+ ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ guidance_rescale: float = 0.0,
+ original_size: Optional[Tuple[int, int]] = None,
+ crops_coords_top_left: Tuple[int, int] = (0, 0),
+ target_size: Optional[Tuple[int, int]] = None,
+ negative_original_size: Optional[Tuple[int, int]] = None,
+ negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
+ negative_target_size: Optional[Tuple[int, int]] = None,
+ clip_skip: Optional[int] = None,
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+ ):
+ r"""
+ Function invoked when calling the pipeline for generation.
+
+ Args:
+ prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts to guide the video generation. If not defined, one has to pass `prompt_embeds`
+ instead.
+ prompt_2 (`str` or `List[str]`, *optional*):
+ The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+ used in both text-encoders
+ num_frames (`int`, *optional*, defaults to 16):
+ The number of video frames that are generated. Defaults to 16 frames, which at 8 frames per second
+ amounts to 2 seconds of video.
+ height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+ The height in pixels of the generated video. This is set to 1024 by default for the best results.
+ Anything below 512 pixels won't work well for
+ [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+ and checkpoints that are not specifically fine-tuned on low resolutions.
+ width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+ The width in pixels of the generated video. This is set to 1024 by default for the best results.
+ Anything below 512 pixels won't work well for
+ [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+ and checkpoints that are not specifically fine-tuned on low resolutions.
+ num_inference_steps (`int`, *optional*, defaults to 50):
+ The number of denoising steps. More denoising steps usually lead to a higher quality video at the
+ expense of slower inference.
+ timesteps (`List[int]`, *optional*):
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+ passed will be used. Must be in descending order.
+ denoising_end (`float`, *optional*):
+ When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
+ completed before it is intentionally terminated early. As a result, the returned sample will still
+ retain a substantial amount of noise, as determined by the discrete timesteps selected by the
+ scheduler. The `denoising_end` parameter should ideally be used when this pipeline forms part of a
+ "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
+ Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
+ guidance_scale (`float`, *optional*, defaults to 5.0):
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+ `guidance_scale` is defined as `w` of equation 2 of the [Imagen
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+ 1`. A higher guidance scale encourages the model to generate videos that are closely linked to the text
+ `prompt`, usually at the expense of lower video quality.
+ negative_prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the video generation. If not defined, one has to pass
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+ less than `1`).
+ negative_prompt_2 (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the video generation to be sent to `tokenizer_2` and
+ `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders.
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
+ The number of videos to generate per prompt.
+ eta (`float`, *optional*, defaults to 0.0):
+ Corresponds to parameter eta (η) from the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+ [`schedulers.DDIMScheduler`] and is ignored for other schedulers.
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+ to make generation deterministic.
+ latents (`torch.FloatTensor`, *optional*):
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for video
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+ tensor will be generated by sampling using the supplied random `generator`.
+ prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+ provided, text embeddings will be generated from `prompt` input argument.
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+ argument.
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
+ negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+ input argument.
+ ip_adapter_image (`PipelineImageInput`, *optional*):
+ Optional image input to work with IP Adapters.
+ ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
+ Pre-generated image embeddings for IP-Adapter. If not provided, embeddings are computed from the
+ `ip_adapter_image` input argument.
+ output_type (`str`, *optional*, defaults to `"pil"`):
+ The output format of the generated video. Choose between
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] instead of a
+ plain tuple.
+ cross_attention_kwargs (`dict`, *optional*):
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+ `self.processor` in
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+ guidance_rescale (`float`, *optional*, defaults to 0.0):
+ Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
+ Flawed](https://arxiv.org/pdf/2305.08891.pdf). `guidance_rescale` is defined as `φ` in equation 16 of
+ that paper. The guidance rescale factor should fix overexposure when using zero terminal SNR.
+ original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+ If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+ `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
+ explained in section 2.2 of
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+ crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+ `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+ `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+ `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+ target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+ For most cases, `target_size` should be set to the desired height and width of the generated image. If
+ not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
+ section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+ negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+ To negatively condition the generation process based on a specific image resolution. Part of SDXL's
+ micro-conditioning as explained in section 2.2 of
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+ information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+ negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+ To negatively condition the generation process based on specific crop coordinates. Part of SDXL's
+ micro-conditioning as explained in section 2.2 of
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+ information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+ negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+ To negatively condition the generation process based on a target image resolution. It should be the
+ same as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+ information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+ callback_on_step_end (`Callable`, *optional*):
+ A function that is called at the end of each denoising step during inference. The function is called
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+ `callback_on_step_end_tensor_inputs`.
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+ will be passed as the `callback_kwargs` argument. You will only be able to include variables listed in the
+ `._callback_tensor_inputs` attribute of your pipeline class.
+
+ Examples:
+
+ Returns:
+ [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] or `tuple`:
+ If `return_dict` is `True`, [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] is
+ returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
+ """
+
+ # 0. Default height and width to unet
+ height = height or self.default_sample_size * self.vae_scale_factor
+ width = width or self.default_sample_size * self.vae_scale_factor
+
+ num_videos_per_prompt = 1
+
+ original_size = original_size or (height, width)
+ target_size = target_size or (height, width)
+
+ # 1. Check inputs. Raise error if not correct
+ self.check_inputs(
+ prompt,
+ prompt_2,
+ height,
+ width,
+ negative_prompt,
+ negative_prompt_2,
+ prompt_embeds,
+ negative_prompt_embeds,
+ pooled_prompt_embeds,
+ negative_pooled_prompt_embeds,
+ callback_on_step_end_tensor_inputs,
+ )
+
+ self._guidance_scale = guidance_scale
+ self._guidance_rescale = guidance_rescale
+ self._clip_skip = clip_skip
+ self._cross_attention_kwargs = cross_attention_kwargs
+ self._denoising_end = denoising_end
+ self._interrupt = False
+
+ # 2. Define call parameters
+ if prompt is not None and isinstance(prompt, str):
+ batch_size = 1
+ elif prompt is not None and isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ batch_size = prompt_embeds.shape[0]
+
+ device = self._execution_device
+
+ # 3. Encode input prompt
+ lora_scale = (
+ self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+ )
+
+ (
+ prompt_embeds,
+ negative_prompt_embeds,
+ pooled_prompt_embeds,
+ negative_pooled_prompt_embeds,
+ ) = self.encode_prompt(
+ prompt=prompt,
+ prompt_2=prompt_2,
+ device=device,
+ num_videos_per_prompt=num_videos_per_prompt,
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
+ negative_prompt=negative_prompt,
+ negative_prompt_2=negative_prompt_2,
+ prompt_embeds=prompt_embeds,
+ negative_prompt_embeds=negative_prompt_embeds,
+ pooled_prompt_embeds=pooled_prompt_embeds,
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+ lora_scale=lora_scale,
+ clip_skip=self.clip_skip,
+ )
+
+ # 4. Prepare timesteps
+ timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+
+ # 5. Prepare latent variables
+ num_channels_latents = self.unet.config.in_channels
+ latents = self.prepare_latents(
+ batch_size * num_videos_per_prompt,
+ num_channels_latents,
+ num_frames,
+ height,
+ width,
+ prompt_embeds.dtype,
+ device,
+ generator,
+ latents,
+ )
+
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+ # 7. Prepare added time ids & embeddings
+ add_text_embeds = pooled_prompt_embeds
+ if self.text_encoder_2 is None:
+ text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+ else:
+ text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+
+ add_time_ids = self._get_add_time_ids(
+ original_size,
+ crops_coords_top_left,
+ target_size,
+ dtype=prompt_embeds.dtype,
+ text_encoder_projection_dim=text_encoder_projection_dim,
+ )
+ if negative_original_size is not None and negative_target_size is not None:
+ negative_add_time_ids = self._get_add_time_ids(
+ negative_original_size,
+ negative_crops_coords_top_left,
+ negative_target_size,
+ dtype=prompt_embeds.dtype,
+ text_encoder_projection_dim=text_encoder_projection_dim,
+ )
+ else:
+ negative_add_time_ids = add_time_ids
+
+ if self.do_classifier_free_guidance:
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+ add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+ add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
+
+ prompt_embeds = prompt_embeds.to(device)
+ add_text_embeds = add_text_embeds.to(device)
+ add_time_ids = add_time_ids.to(device).repeat(batch_size * num_videos_per_prompt, 1)
+
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+ image_embeds = self.prepare_ip_adapter_image_embeds(
+ ip_adapter_image,
+ ip_adapter_image_embeds,
+ device,
+ batch_size * num_videos_per_prompt,
+ self.do_classifier_free_guidance,
+ )
+
+ # 7.1 Apply denoising_end
+ if (
+ self.denoising_end is not None
+ and isinstance(self.denoising_end, float)
+ and self.denoising_end > 0
+ and self.denoising_end < 1
+ ):
+ discrete_timestep_cutoff = int(
+ round(
+ self.scheduler.config.num_train_timesteps
+ - (self.denoising_end * self.scheduler.config.num_train_timesteps)
+ )
+ )
+ num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
+ timesteps = timesteps[:num_inference_steps]
+
+ # 8. Optionally get Guidance Scale Embedding
+ timestep_cond = None
+ if self.unet.config.time_cond_proj_dim is not None:
+ guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_videos_per_prompt)
+ timestep_cond = self.get_guidance_scale_embedding(
+ guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+ ).to(device=device, dtype=latents.dtype)
+
+ num_free_init_iters = self._free_init_num_iters if self.free_init_enabled else 1
+ for free_init_iter in range(num_free_init_iters):
+ if self.free_init_enabled:
+ latents, timesteps = self._apply_free_init(
+ latents, free_init_iter, num_inference_steps, device, latents.dtype, generator
+ )
+
+ self._num_timesteps = len(timesteps)
+
+ # 9. Denoising loop
+ with self.progress_bar(total=self._num_timesteps) as progress_bar:
+ for i, t in enumerate(timesteps):
+ if self.interrupt:
+ continue
+
+ # expand the latents if we are doing classifier free guidance
+ latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+ # predict the noise residual
+ added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+ added_cond_kwargs["image_embeds"] = image_embeds
+
+ noise_pred = self.unet(
+ latent_model_input,
+ t,
+ encoder_hidden_states=prompt_embeds,
+ timestep_cond=timestep_cond,
+ cross_attention_kwargs=self.cross_attention_kwargs,
+ added_cond_kwargs=added_cond_kwargs,
+ return_dict=False,
+ )[0]
+
+ # perform guidance
+ if self.do_classifier_free_guidance:
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+ if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
+ # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+ noise_pred = rescale_noise_cfg(
+ noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale
+ )
+
+ # compute the previous noisy sample x_t -> x_t-1
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+ if callback_on_step_end is not None:
+ callback_kwargs = {}
+ for k in callback_on_step_end_tensor_inputs:
+ callback_kwargs[k] = locals()[k]
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+ latents = callback_outputs.pop("latents", latents)
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+ add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
+ negative_pooled_prompt_embeds = callback_outputs.pop(
+ "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
+ )
+ add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
+ negative_add_time_ids = callback_outputs.pop("negative_add_time_ids", negative_add_time_ids)
+
+ progress_bar.update()
+
+ # make sure the VAE is in float32 mode, as it overflows in float16
+ needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+
+ if needs_upcasting:
+ self.upcast_vae()
+ latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+
+ # 10. Post processing
+ if output_type == "latent":
+ video = latents
+ else:
+ video_tensor = self.decode_latents(latents)
+ video = tensor2vid(video_tensor, self.image_processor, output_type=output_type)
+
+ # cast back to fp16 if needed
+ if needs_upcasting:
+ self.vae.to(dtype=torch.float16)
+
+ # 11. Offload all models
+ self.maybe_free_model_hooks()
+
+ if not return_dict:
+ return (video,)
+
+ return AnimateDiffPipelineOutput(frames=video)
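# Illustrative usage sketch (not part of this diff) for the `__call__` documented above.
# The class name `AnimateDiffSDXLPipeline`, the checkpoint ids and the `export_to_gif`
# helper are assumptions made for the example; adjust them to whatever this PR exposes.
import torch
from diffusers import AnimateDiffSDXLPipeline, MotionAdapter
from diffusers.utils import export_to_gif

adapter = MotionAdapter.from_pretrained(
    "guoyww/animatediff-motion-adapter-sdxl-beta", torch_dtype=torch.float16
)
pipe = AnimateDiffSDXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", motion_adapter=adapter, torch_dtype=torch.float16
).to("cuda")

output = pipe(
    prompt="a panda surfing a wave, golden hour",
    num_frames=16,              # 16 frames ~ 2 seconds at 8 fps
    guidance_scale=5.0,         # > 1 enables classifier-free guidance
    num_inference_steps=25,
    output_type="pil",
)
export_to_gif(output.frames[0], "panda_surfing.gif")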
diff --git a/src/diffusers/pipelines/auto_pipeline.py b/src/diffusers/pipelines/auto_pipeline.py
index 79ee3fdad461..5fb497ef2e22 100644
--- a/src/diffusers/pipelines/auto_pipeline.py
+++ b/src/diffusers/pipelines/auto_pipeline.py
@@ -45,7 +45,7 @@
)
from .kandinsky3 import Kandinsky3Img2ImgPipeline, Kandinsky3Pipeline
from .latent_consistency_models import LatentConsistencyModelImg2ImgPipeline, LatentConsistencyModelPipeline
-from .pixart_alpha import PixArtAlphaPipeline
+from .pixart_alpha import PixArtAlphaPipeline, PixArtSigmaPipeline
from .stable_cascade import StableCascadeCombinedPipeline, StableCascadeDecoderPipeline
from .stable_diffusion import (
StableDiffusionImg2ImgPipeline,
@@ -73,7 +73,8 @@
("wuerstchen", WuerstchenCombinedPipeline),
("cascade", StableCascadeCombinedPipeline),
("lcm", LatentConsistencyModelPipeline),
- ("pixart", PixArtAlphaPipeline),
+ ("pixart-alpha", PixArtAlphaPipeline),
+ ("pixart-sigma", PixArtSigmaPipeline),
]
)
@@ -216,7 +217,7 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs):
```
Parameters:
- pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
+ pretrained_model_or_path (`str` or `os.PathLike`, *optional*):
Can be either:
- A string, the *repo id* (for example `CompVis/ldm-text2im-large-256`) of a pretrained pipeline
@@ -233,9 +234,9 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs):
cache_dir (`Union[str, os.PathLike]`, *optional*):
Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
is not used.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
- incompletely downloaded files are deleted.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -310,7 +311,7 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs):
"""
cache_dir = kwargs.pop("cache_dir", None)
force_download = kwargs.pop("force_download", False)
- resume_download = kwargs.pop("resume_download", False)
+ resume_download = kwargs.pop("resume_download", None)
proxies = kwargs.pop("proxies", None)
token = kwargs.pop("token", None)
local_files_only = kwargs.pop("local_files_only", False)
@@ -489,7 +490,7 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs):
```
Parameters:
- pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
+ pretrained_model_or_path (`str` or `os.PathLike`, *optional*):
Can be either:
- A string, the *repo id* (for example `CompVis/ldm-text2im-large-256`) of a pretrained pipeline
@@ -506,9 +507,9 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs):
cache_dir (`Union[str, os.PathLike]`, *optional*):
Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
is not used.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
- incompletely downloaded files are deleted.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -583,7 +584,7 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs):
"""
cache_dir = kwargs.pop("cache_dir", None)
force_download = kwargs.pop("force_download", False)
- resume_download = kwargs.pop("resume_download", False)
+ resume_download = kwargs.pop("resume_download", None)
proxies = kwargs.pop("proxies", None)
token = kwargs.pop("token", None)
local_files_only = kwargs.pop("local_files_only", False)
@@ -765,7 +766,7 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs):
```
Parameters:
- pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
+ pretrained_model_or_path (`str` or `os.PathLike`, *optional*):
Can be either:
- A string, the *repo id* (for example `CompVis/ldm-text2im-large-256`) of a pretrained pipeline
@@ -782,9 +783,9 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs):
cache_dir (`Union[str, os.PathLike]`, *optional*):
Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
is not used.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
- incompletely downloaded files are deleted.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -859,7 +860,7 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs):
"""
cache_dir = kwargs.pop("cache_dir", None)
force_download = kwargs.pop("force_download", False)
- resume_download = kwargs.pop("resume_download", False)
+ resume_download = kwargs.pop("resume_download", None)
proxies = kwargs.pop("proxies", None)
token = kwargs.pop("token", None)
local_files_only = kwargs.pop("local_files_only", False)
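# A minimal sketch of what the new "pixart-sigma" mapping enables: AutoPipelineForText2Image
# should now resolve PixArt-Sigma checkpoints to `PixArtSigmaPipeline` rather than falling
# back to the alpha pipeline. The checkpoint id below is an assumption for illustration.
import torch
from diffusers import AutoPipelineForText2Image

pipe = AutoPipelineForText2Image.from_pretrained(
    "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS", torch_dtype=torch.float16
).to("cuda")
print(type(pipe).__name__)  # expected: PixArtSigmaPipeline
image = pipe("a red fox curled up in the snow").images[0]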
diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py
index a5a0aaed0f2e..022f30d819d8 100644
--- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py
+++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py
@@ -1169,15 +1169,16 @@ def __call__(
self._num_timesteps = len(timesteps)
# 6. Prepare latent variables
- latents = self.prepare_latents(
- image,
- latent_timestep,
- batch_size,
- num_images_per_prompt,
- prompt_embeds.dtype,
- device,
- generator,
- )
+ if latents is None:
+ latents = self.prepare_latents(
+ image,
+ latent_timestep,
+ batch_size,
+ num_images_per_prompt,
+ prompt_embeds.dtype,
+ device,
+ generator,
+ )
# 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
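# Toy sketch of what the `if latents is None` guard above permits: a caller can now hand
# the ControlNet img2img pipeline a pre-built latent tensor, and `prepare_latents()` is
# skipped entirely. Checkpoints, image sizes and the latent shape are illustrative
# assumptions only.
import torch
from PIL import Image
from diffusers import ControlNetModel, StableDiffusionControlNetImg2ImgPipeline

controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
pipe = StableDiffusionControlNetImg2ImgPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
).to("cuda")

init_image = Image.new("RGB", (512, 512), "gray")       # placeholder inputs
control_image = Image.new("RGB", (512, 512), "black")
latents = torch.randn(1, 4, 64, 64, dtype=torch.float16, device="cuda")

result = pipe(
    prompt="a watercolor landscape",
    image=init_image,
    control_image=control_image,
    latents=latents,            # used as-is; the init image is no longer re-encoded into latents
)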
diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py
index 18c4370b8025..b9c4e3c0032c 100644
--- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py
+++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py
@@ -151,7 +151,12 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
class StableDiffusionXLControlNetInpaintPipeline(
- DiffusionPipeline, StableDiffusionMixin, StableDiffusionXLLoraLoaderMixin, FromSingleFileMixin, IPAdapterMixin
+ DiffusionPipeline,
+ StableDiffusionMixin,
+ StableDiffusionXLLoraLoaderMixin,
+ FromSingleFileMixin,
+ IPAdapterMixin,
+ TextualInversionLoaderMixin,
):
r"""
Pipeline for text-to-image generation using Stable Diffusion XL.
@@ -160,6 +165,7 @@ class StableDiffusionXLControlNetInpaintPipeline(
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
The pipeline also inherits the following loading methods:
+ - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
- [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
- [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
- [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py
index d32e7d81649d..dfd3cc239b36 100644
--- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py
+++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py
@@ -89,8 +89,8 @@
... variant="fp16",
... use_safetensors=True,
... torch_dtype=torch.float16,
- ... ).to("cuda")
- >>> vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16).to("cuda")
+ ... )
+ >>> vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
>>> pipe = StableDiffusionXLControlNetImg2ImgPipeline.from_pretrained(
... "stabilityai/stable-diffusion-xl-base-1.0",
... controlnet=controlnet,
@@ -98,7 +98,7 @@
... variant="fp16",
... use_safetensors=True,
... torch_dtype=torch.float16,
- ... ).to("cuda")
+ ... )
>>> pipe.enable_model_cpu_offload()
@@ -1429,16 +1429,17 @@ def __call__(
self._num_timesteps = len(timesteps)
# 6. Prepare latent variables
- latents = self.prepare_latents(
- image,
- latent_timestep,
- batch_size,
- num_images_per_prompt,
- prompt_embeds.dtype,
- device,
- generator,
- True,
- )
+ if latents is None:
+ latents = self.prepare_latents(
+ image,
+ latent_timestep,
+ batch_size,
+ num_images_per_prompt,
+ prompt_embeds.dtype,
+ device,
+ generator,
+ True,
+ )
# 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
diff --git a/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py
index 3c3bd526692d..c84caa1fad88 100644
--- a/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py
+++ b/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py
@@ -363,6 +363,7 @@ class conditioning with `class_embed_type` equal to `None`.
"""
_supports_gradient_checkpointing = True
+ _no_split_modules = ["BasicTransformerBlock", "ResnetBlockFlat", "CrossAttnUpBlockFlat"]
@register_to_config
def __init__(
diff --git a/src/diffusers/pipelines/dit/pipeline_dit.py b/src/diffusers/pipelines/dit/pipeline_dit.py
index 289ea496028d..a3ea90874a12 100644
--- a/src/diffusers/pipelines/dit/pipeline_dit.py
+++ b/src/diffusers/pipelines/dit/pipeline_dit.py
@@ -227,6 +227,9 @@ def __call__(
if output_type == "pil":
samples = self.numpy_to_pil(samples)
+ # Offload all models
+ self.maybe_free_model_hooks()
+
if not return_dict:
return (samples,)
diff --git a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py
index 8957d7140ef1..fce694d1d0bd 100644
--- a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py
+++ b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py
@@ -872,9 +872,10 @@ def __call__(
else self.scheduler.config.original_inference_steps
)
latent_timestep = timesteps[:1]
- latents = self.prepare_latents(
- image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator
- )
+ if latents is None:
+ latents = self.prepare_latents(
+ image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator
+ )
bs = batch_size * num_images_per_prompt
# 6. Get Guidance Scale Embedding
diff --git a/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py b/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py
index cfab70926a4a..5ea7c2c14551 100644
--- a/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py
+++ b/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py
@@ -1419,7 +1419,6 @@ def encode_image(self, image, dtype=None, height=None, width=None, resize_mode="
if needs_upcasting:
image = image.float()
self.upcast_vae()
- image = image.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
x0 = self.vae.encode(image).latent_dist.mode()
x0 = x0.to(dtype)
diff --git a/src/diffusers/pipelines/pipeline_flax_utils.py b/src/diffusers/pipelines/pipeline_flax_utils.py
index b1035c1f2f42..7534149b559a 100644
--- a/src/diffusers/pipelines/pipeline_flax_utils.py
+++ b/src/diffusers/pipelines/pipeline_flax_utils.py
@@ -254,9 +254,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
- incompletely downloaded files are deleted.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -316,7 +316,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
```
"""
cache_dir = kwargs.pop("cache_dir", None)
- resume_download = kwargs.pop("resume_download", False)
+ resume_download = kwargs.pop("resume_download", None)
proxies = kwargs.pop("proxies", None)
local_files_only = kwargs.pop("local_files_only", False)
token = kwargs.pop("token", None)
diff --git a/src/diffusers/pipelines/pipeline_loading_utils.py b/src/diffusers/pipelines/pipeline_loading_utils.py
index 15fb34e72d24..f7d9785043d1 100644
--- a/src/diffusers/pipelines/pipeline_loading_utils.py
+++ b/src/diffusers/pipelines/pipeline_loading_utils.py
@@ -435,7 +435,7 @@ def _load_empty_model(
return_unused_kwargs=True,
return_commit_hash=True,
force_download=kwargs.pop("force_download", False),
- resume_download=kwargs.pop("resume_download", False),
+ resume_download=kwargs.pop("resume_download", None),
proxies=kwargs.pop("proxies", None),
local_files_only=kwargs.pop("local_files_only", False),
token=kwargs.pop("token", None),
@@ -454,7 +454,7 @@ def _load_empty_model(
cached_folder,
subfolder=name,
force_download=kwargs.pop("force_download", False),
- resume_download=kwargs.pop("resume_download", False),
+ resume_download=kwargs.pop("resume_download", None),
proxies=kwargs.pop("proxies", None),
local_files_only=kwargs.pop("local_files_only", False),
token=kwargs.pop("token", None),
diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py
index 68433332546b..e5f822caa0ef 100644
--- a/src/diffusers/pipelines/pipeline_utils.py
+++ b/src/diffusers/pipelines/pipeline_utils.py
@@ -376,7 +376,11 @@ def module_is_sequentially_offloaded(module):
if not is_accelerate_available() or is_accelerate_version("<", "0.14.0"):
return False
- return hasattr(module, "_hf_hook") and isinstance(module._hf_hook, accelerate.hooks.AlignDevicesHook)
+ return hasattr(module, "_hf_hook") and (
+ isinstance(module._hf_hook, accelerate.hooks.AlignDevicesHook)
+ or hasattr(module._hf_hook, "hooks")
+ and isinstance(module._hf_hook.hooks[0], accelerate.hooks.AlignDevicesHook)
+ )
def module_is_offloaded(module):
if not is_accelerate_available() or is_accelerate_version("<", "0.17.0.dev0"):
@@ -529,9 +533,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
cache_dir (`Union[str, os.PathLike]`, *optional*):
Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
is not used.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
- incompletely downloaded files are deleted.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -621,7 +625,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
```
"""
cache_dir = kwargs.pop("cache_dir", None)
- resume_download = kwargs.pop("resume_download", False)
+ resume_download = kwargs.pop("resume_download", None)
force_download = kwargs.pop("force_download", False)
proxies = kwargs.pop("proxies", None)
local_files_only = kwargs.pop("local_files_only", None)
@@ -1005,8 +1009,7 @@ def remove_all_hooks(self):
"""
for _, model in self.components.items():
if isinstance(model, torch.nn.Module) and hasattr(model, "_hf_hook"):
- is_sequential_cpu_offload = isinstance(getattr(model, "_hf_hook"), accelerate.hooks.AlignDevicesHook)
- accelerate.hooks.remove_hook_from_module(model, recurse=is_sequential_cpu_offload)
+ accelerate.hooks.remove_hook_from_module(model, recurse=True)
self._all_hooks = []
def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
@@ -1213,9 +1216,9 @@ def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]:
force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
- incompletely downloaded files are deleted.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -1268,7 +1271,7 @@ def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]:
"""
cache_dir = kwargs.pop("cache_dir", None)
- resume_download = kwargs.pop("resume_download", False)
+ resume_download = kwargs.pop("resume_download", None)
force_download = kwargs.pop("force_download", False)
proxies = kwargs.pop("proxies", None)
local_files_only = kwargs.pop("local_files_only", None)
diff --git a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py
index c9efe594e398..22f0c507e5ee 100644
--- a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py
+++ b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py
@@ -273,15 +273,6 @@ def __init__(
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.image_processor = PixArtImageProcessor(vae_scale_factor=self.vae_scale_factor)
- # Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/utils.py
- def mask_text_embeddings(self, emb, mask):
- if emb.shape[0] == 1:
- keep_index = mask.sum().item()
- return emb[:, :, :keep_index, :], keep_index
- else:
- masked_feature = emb * mask[:, None, :, None]
- return masked_feature, emb.shape[2]
-
# Adapted from diffusers.pipelines.deepfloyd_if.pipeline_if.encode_prompt
def encode_prompt(
self,
diff --git a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py
index dadf41c176ea..e3dd3ddc0404 100644
--- a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py
+++ b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py
@@ -199,16 +199,7 @@ def __init__(
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.image_processor = PixArtImageProcessor(vae_scale_factor=self.vae_scale_factor)
- # copied from diffusers.pipelines.pixart_alpha.pipeline_pixart_alpha.py
- def mask_text_embeddings(self, emb, mask):
- if emb.shape[0] == 1:
- keep_index = mask.sum().item()
- return emb[:, :, :keep_index, :], keep_index
- else:
- masked_feature = emb * mask[:, None, :, None]
- return masked_feature, emb.shape[2]
-
- # Adapted from diffusers.pipelines.deepfloyd_if.pipeline_if.encode_prompt
+ # Copied from diffusers.pipelines.pixart_alpha.pipeline_pixart_alpha.PixArtAlphaPipeline.encode_prompt
def encode_prompt(
self,
prompt: Union[str, List[str]],
@@ -369,7 +360,7 @@ def prepare_extra_step_kwargs(self, generator, eta):
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
- # copied from diffusers.pipelines.pixart_alpha.pipeline_pixart_alpha.py
+ # Copied from diffusers.pipelines.pixart_alpha.pipeline_pixart_alpha.PixArtAlphaPipeline.check_inputs
def check_inputs(
self,
prompt,
@@ -462,7 +453,7 @@ def process(text: str):
return [process(t) for t in text]
- # Copied from diffusers.pipelines.pixart_alpha.pipeline_pixart_alpha.PixArtAlphaPipeline._clean_caption
+ # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption
def _clean_caption(self, caption):
caption = str(caption)
caption = ul.unquote_plus(caption)
diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py
index 02e32633cedb..700ca5db6f07 100644
--- a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py
+++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py
@@ -239,15 +239,15 @@ def __call__(
num_embeddings = self.prior.config.num_embeddings
embedding_dim = self.prior.config.embedding_dim
-
- latents = self.prepare_latents(
- (batch_size, num_embeddings * embedding_dim),
- image_embeds.dtype,
- device,
- generator,
- latents,
- self.scheduler,
- )
+ if latents is None:
+ latents = self.prepare_latents(
+ (batch_size, num_embeddings * embedding_dim),
+ image_embeds.dtype,
+ device,
+ generator,
+ latents,
+ self.scheduler,
+ )
# YiYi notes: for testing only to match ldm, we can directly create a latents with desired shape: batch_size, num_embeddings, embedding_dim
latents = latents.reshape(latents.shape[0], num_embeddings, embedding_dim)
diff --git a/src/diffusers/pipelines/stable_diffusion/__init__.py b/src/diffusers/pipelines/stable_diffusion/__init__.py
index 0eda32d333b9..8ce08aec66ca 100644
--- a/src/diffusers/pipelines/stable_diffusion/__init__.py
+++ b/src/diffusers/pipelines/stable_diffusion/__init__.py
@@ -113,7 +113,6 @@
from .pipeline_stable_diffusion import (
StableDiffusionPipeline,
StableDiffusionPipelineOutput,
- StableDiffusionSafetyChecker,
)
from .pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipeline
from .pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipeline
diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py
index f04a21ef4857..b4b1b885dd3c 100644
--- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py
+++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py
@@ -557,7 +557,7 @@ def convert_ldm_unet_checkpoint(
paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
)
- output_block_list = {k: sorted(v) for k, v in output_block_list.items()}
+ output_block_list = {k: sorted(v) for k, v in sorted(output_block_list.items())}
if ["conv.bias", "conv.weight"] in output_block_list.values():
index = list(output_block_list.values()).index(["conv.bias", "conv.weight"])
new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py
index de2767e23952..0bf5a92a4fcc 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py
@@ -172,6 +172,7 @@ def __call__(
prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None,
ip_adapter_image: Optional[PipelineImageInput] = None,
+ ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
output_type: Optional[str] = "pil",
return_dict: bool = True,
callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
@@ -296,6 +297,8 @@ def __call__(
negative_prompt,
prompt_embeds,
negative_prompt_embeds,
+ ip_adapter_image,
+ ip_adapter_image_embeds,
callback_on_step_end_tensor_inputs,
)
self._guidance_scale = guidance_scale
@@ -303,14 +306,6 @@ def __call__(
device = self._execution_device
- if ip_adapter_image is not None:
- output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
- image_embeds, negative_image_embeds = self.encode_image(
- ip_adapter_image, device, num_images_per_prompt, output_hidden_state
- )
- if self.do_classifier_free_guidance:
- image_embeds = torch.cat([image_embeds, negative_image_embeds, negative_image_embeds])
-
if image is None:
raise ValueError("`image` input cannot be undefined.")
@@ -335,6 +330,14 @@ def __call__(
negative_prompt_embeds=negative_prompt_embeds,
)
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+ image_embeds = self.prepare_ip_adapter_image_embeds(
+ ip_adapter_image,
+ ip_adapter_image_embeds,
+ device,
+ batch_size * num_images_per_prompt,
+ self.do_classifier_free_guidance,
+ )
# 3. Preprocess image
image = self.image_processor.preprocess(image)
@@ -635,6 +638,65 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state
return image_embeds, uncond_image_embeds
+ def prepare_ip_adapter_image_embeds(
+ self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+ ):
+ if ip_adapter_image_embeds is None:
+ if not isinstance(ip_adapter_image, list):
+ ip_adapter_image = [ip_adapter_image]
+
+ if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+ raise ValueError(
+ f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+ )
+
+ image_embeds = []
+ for single_ip_adapter_image, image_proj_layer in zip(
+ ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+ ):
+ output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+ single_image_embeds, single_negative_image_embeds = self.encode_image(
+ single_ip_adapter_image, device, 1, output_hidden_state
+ )
+ single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
+ single_negative_image_embeds = torch.stack(
+ [single_negative_image_embeds] * num_images_per_prompt, dim=0
+ )
+
+ if do_classifier_free_guidance:
+ single_image_embeds = torch.cat(
+ [single_image_embeds, single_negative_image_embeds, single_negative_image_embeds]
+ )
+ single_image_embeds = single_image_embeds.to(device)
+
+ image_embeds.append(single_image_embeds)
+ else:
+ repeat_dims = [1]
+ image_embeds = []
+ for single_image_embeds in ip_adapter_image_embeds:
+ if do_classifier_free_guidance:
+ (
+ single_image_embeds,
+ single_negative_image_embeds,
+ single_negative_image_embeds,
+ ) = single_image_embeds.chunk(3)
+ single_image_embeds = single_image_embeds.repeat(
+ num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
+ )
+ single_negative_image_embeds = single_negative_image_embeds.repeat(
+ num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
+ )
+ single_image_embeds = torch.cat(
+ [single_image_embeds, single_negative_image_embeds, single_negative_image_embeds]
+ )
+ else:
+ single_image_embeds = single_image_embeds.repeat(
+ num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
+ )
+ image_embeds.append(single_image_embeds)
+
+ return image_embeds
+
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
def run_safety_checker(self, image, device, dtype):
if self.safety_checker is None:
@@ -687,6 +749,8 @@ def check_inputs(
negative_prompt=None,
prompt_embeds=None,
negative_prompt_embeds=None,
+ ip_adapter_image=None,
+ ip_adapter_image_embeds=None,
callback_on_step_end_tensor_inputs=None,
):
if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
@@ -728,6 +792,21 @@ def check_inputs(
f" {negative_prompt_embeds.shape}."
)
+ if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+ raise ValueError(
+ "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+ )
+
+ if ip_adapter_image_embeds is not None:
+ if not isinstance(ip_adapter_image_embeds, list):
+ raise ValueError(
+ f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+ )
+ elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+ raise ValueError(
+ f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+ )
+
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
shape = (
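# Hedged usage sketch for the `ip_adapter_image_embeds` path added above: embeddings can
# be computed once with the new `prepare_ip_adapter_image_embeds` helper and reused across
# calls. The checkpoints, the adapter weight name and the guidance flag are assumptions
# for illustration.
import torch
from PIL import Image
from diffusers import StableDiffusionInstructPix2PixPipeline

pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
    "timbrooks/instruct-pix2pix", torch_dtype=torch.float16
).to("cuda")
pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")

input_image = Image.new("RGB", (512, 512), "gray")   # placeholder inputs
style_image = Image.new("RGB", (512, 512), "blue")

# Precompute the embeddings (positive + negative chunks when CFG is on) ...
image_embeds = pipe.prepare_ip_adapter_image_embeds(
    [style_image], None, pipe.device, num_images_per_prompt=1, do_classifier_free_guidance=True
)
# ... and pass them instead of `ip_adapter_image` on subsequent calls.
edited = pipe(
    prompt="make it look like a watercolor painting",
    image=input_image,
    ip_adapter_image_embeds=image_embeds,
).images[0]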
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py
index fe19b4de3127..134ec39effc5 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py
@@ -786,16 +786,17 @@ def __call__(
# 6. Prepare latent variables
num_channels_latents = self.unet.config.in_channels
- latents = self.prepare_latents(
- batch_size=batch_size,
- num_channels_latents=num_channels_latents,
- height=height,
- width=width,
- dtype=prompt_embeds.dtype,
- device=device,
- generator=generator,
- latents=latents,
- )
+ if latents is None:
+ latents = self.prepare_latents(
+ batch_size=batch_size,
+ num_channels_latents=num_channels_latents,
+ height=height,
+ width=width,
+ dtype=prompt_embeds.dtype,
+ device=device,
+ generator=generator,
+ latents=latents,
+ )
# 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
diff --git a/src/diffusers/pipelines/stable_diffusion/safety_checker.py b/src/diffusers/pipelines/stable_diffusion/safety_checker.py
index 6cc4d26f29b4..3e6dec3e0bff 100644
--- a/src/diffusers/pipelines/stable_diffusion/safety_checker.py
+++ b/src/diffusers/pipelines/stable_diffusion/safety_checker.py
@@ -31,6 +31,7 @@ def cosine_distance(image_embeds, text_embeds):
class StableDiffusionSafetyChecker(PreTrainedModel):
config_class = CLIPConfig
+ main_input_name = "clip_input"
_no_split_modules = ["CLIPEncoderLayer"]
diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py
index b72b19d5c1ef..b98ea279c1a2 100644
--- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py
@@ -1247,17 +1247,19 @@ def denoising_value_valid(dnv):
latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
add_noise = True if self.denoising_start is None else False
+
# 6. Prepare latent variables
- latents = self.prepare_latents(
- image,
- latent_timestep,
- batch_size,
- num_images_per_prompt,
- prompt_embeds.dtype,
- device,
- generator,
- add_noise,
- )
+ if latents is None:
+ latents = self.prepare_latents(
+ image,
+ latent_timestep,
+ batch_size,
+ num_images_per_prompt,
+ prompt_embeds.dtype,
+ device,
+ generator,
+ add_noise,
+ )
# 7. Prepare extra step kwargs.
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py
index a2242bb099c5..d9380020b329 100644
--- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py
+++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py
@@ -436,7 +436,6 @@ def prepare_extra_step_kwargs(self, generator, eta):
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_instruct_pix2pix.StableDiffusionInstructPix2PixPipeline.check_inputs
def check_inputs(
self,
prompt,
@@ -526,8 +525,8 @@ def prepare_image_latents(
# make sure the VAE is in float32 mode, as it overflows in float16
needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
if needs_upcasting:
+ image = image.float()
self.upcast_vae()
- image = image.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
image_latents = retrieve_latents(self.vae.encode(image), sample_mode="argmax")
diff --git a/src/diffusers/pipelines/stable_diffusion_xl/watermark.py b/src/diffusers/pipelines/stable_diffusion_xl/watermark.py
index 5b6e36d9f447..f457cdbdb1eb 100644
--- a/src/diffusers/pipelines/stable_diffusion_xl/watermark.py
+++ b/src/diffusers/pipelines/stable_diffusion_xl/watermark.py
@@ -28,9 +28,15 @@ def apply_watermark(self, images: torch.FloatTensor):
images = (255 * (images / 2 + 0.5)).cpu().permute(0, 2, 3, 1).float().numpy()
- images = [self.encoder.encode(image, "dwtDct") for image in images]
+ # Convert RGB to BGR, which is the channel order expected by the watermark encoder.
+ images = images[:, :, :, ::-1]
- images = torch.from_numpy(np.array(images)).permute(0, 3, 1, 2)
+ # Add watermark and convert BGR back to RGB
+ images = [self.encoder.encode(image, "dwtDct")[:, :, ::-1] for image in images]
+
+ images = np.array(images)
+
+ images = torch.from_numpy(images).permute(0, 3, 1, 2)
images = torch.clamp(2 * (images / 255 - 0.5), min=-1.0, max=1.0)
return images
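# Standalone illustration of the channel flip above: reversing the last axis converts an
# RGB batch to BGR (the order the invisible-watermark encoder expects), and a second flip
# restores RGB. Purely a toy example with synthetic data.
import numpy as np

rgb = np.zeros((1, 8, 8, 3), dtype=np.uint8)
rgb[..., 0] = 255                       # pure red stored in RGB order
bgr = rgb[:, :, :, ::-1]                # channel order reversed: red now sits in the last slot
assert bgr[0, 0, 0, 2] == 255
assert np.array_equal(bgr[:, :, :, ::-1], rgb)   # flipping again round-trips back to RGB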
diff --git a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py
index 070183b92409..da6832cebd4d 100644
--- a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py
+++ b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py
@@ -199,6 +199,9 @@ def _encode_vae_image(
image = image.to(device=device)
image_latents = self.vae.encode(image).latent_dist.mode()
+ # duplicate image_latents for each generation per prompt, using mps friendly method
+ image_latents = image_latents.repeat(num_videos_per_prompt, 1, 1, 1)
+
if do_classifier_free_guidance:
negative_image_latents = torch.zeros_like(image_latents)
@@ -207,9 +210,6 @@ def _encode_vae_image(
# to avoid doing two forward passes
image_latents = torch.cat([negative_image_latents, image_latents])
- # duplicate image_latents for each generation per prompt, using mps friendly method
- image_latents = image_latents.repeat(num_videos_per_prompt, 1, 1, 1)
-
return image_latents
def _get_add_time_ids(
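# Toy check of why the repeat above now happens before the classifier-free-guidance
# concatenation: duplicating the per-prompt copies first keeps the negative (zero) half
# and the positive half of the CFG batch the same size and in matching order.
import torch

num_videos_per_prompt = 2
image_latents = torch.arange(4.0).reshape(1, 1, 2, 2)
image_latents = image_latents.repeat(num_videos_per_prompt, 1, 1, 1)    # (2, 1, 2, 2)
negative_image_latents = torch.zeros_like(image_latents)
cfg_batch = torch.cat([negative_image_latents, image_latents])          # (4, 1, 2, 2)
assert cfg_batch.shape[0] == 2 * num_videos_per_prompt
assert torch.equal(cfg_batch[:num_videos_per_prompt], negative_image_latents)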
diff --git a/src/diffusers/schedulers/deprecated/__init__.py b/src/diffusers/schedulers/deprecated/__init__.py
index 786707f45206..479cf9bd568b 100644
--- a/src/diffusers/schedulers/deprecated/__init__.py
+++ b/src/diffusers/schedulers/deprecated/__init__.py
@@ -30,7 +30,7 @@
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
- from ..utils.dummy_pt_objects import * # noqa F403
+ from ...utils.dummy_pt_objects import * # noqa F403
else:
from .scheduling_karras_ve import KarrasVeScheduler
from .scheduling_sde_vp import ScoreSdeVpScheduler
diff --git a/src/diffusers/schedulers/scheduling_ddpm_flax.py b/src/diffusers/schedulers/scheduling_ddpm_flax.py
index 6bdfa5eb5e0d..d06a171159ee 100644
--- a/src/diffusers/schedulers/scheduling_ddpm_flax.py
+++ b/src/diffusers/schedulers/scheduling_ddpm_flax.py
@@ -222,9 +222,13 @@ def step(
t = timestep
if key is None:
- key = jax.random.PRNGKey(0)
+ key = jax.random.key(0)
- if model_output.shape[1] == sample.shape[1] * 2 and self.config.variance_type in ["learned", "learned_range"]:
+ if (
+ len(model_output.shape) > 1
+ and model_output.shape[1] == sample.shape[1] * 2
+ and self.config.variance_type in ["learned", "learned_range"]
+ ):
model_output, predicted_variance = jnp.split(model_output, sample.shape[1], axis=1)
else:
predicted_variance = None
@@ -264,7 +268,7 @@ def step(
# 6. Add noise
def random_variance():
- split_key = jax.random.split(key, num=1)
+ split_key = jax.random.split(key, num=1)[0]
noise = jax.random.normal(split_key, shape=model_output.shape, dtype=self.dtype)
return (self._get_variance(state, t, predicted_variance=predicted_variance) ** 0.5) * noise
diff --git a/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py
index 26a41d7335c5..dfc7978a2ee2 100644
--- a/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py
+++ b/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py
@@ -14,6 +14,7 @@
# DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver and https://github.com/NVlabs/edm
+import math
from typing import List, Optional, Tuple, Union
import numpy as np
@@ -44,6 +45,10 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
range is [0.2, 80.0].
sigma_data (`float`, *optional*, defaults to 0.5):
The standard deviation of the data distribution. This is set to 0.5 in the EDM paper [1].
+ sigma_schedule (`str`, *optional*, defaults to `karras`):
+ Sigma schedule to compute the `sigmas`. By default, we use the schedule introduced in the EDM paper
+ (https://arxiv.org/abs/2206.00364). The other accepted value is "exponential". The exponential schedule was
+ incorporated in this model: https://huggingface.co/stabilityai/cosxl.
num_train_timesteps (`int`, defaults to 1000):
The number of diffusion steps to train the model.
solver_order (`int`, defaults to 2):
@@ -89,6 +94,7 @@ def __init__(
sigma_min: float = 0.002,
sigma_max: float = 80.0,
sigma_data: float = 0.5,
+ sigma_schedule: str = "karras",
num_train_timesteps: int = 1000,
prediction_type: str = "epsilon",
rho: float = 7.0,
@@ -121,7 +127,11 @@ def __init__(
)
ramp = torch.linspace(0, 1, num_train_timesteps)
- sigmas = self._compute_sigmas(ramp)
+ if sigma_schedule == "karras":
+ sigmas = self._compute_karras_sigmas(ramp)
+ elif sigma_schedule == "exponential":
+ sigmas = self._compute_exponential_sigmas(ramp)
+
self.timesteps = self.precondition_noise(sigmas)
self.sigmas = self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)])
@@ -236,7 +246,10 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc
self.num_inference_steps = num_inference_steps
ramp = np.linspace(0, 1, self.num_inference_steps)
- sigmas = self._compute_sigmas(ramp)
+ if self.config.sigma_schedule == "karras":
+ sigmas = self._compute_karras_sigmas(ramp)
+ elif self.config.sigma_schedule == "exponential":
+ sigmas = self._compute_exponential_sigmas(ramp)
sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device)
self.timesteps = self.precondition_noise(sigmas)
@@ -262,10 +275,9 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc
self._begin_index = None
self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication
- # Taken from https://github.com/crowsonkb/k-diffusion/blob/686dbad0f39640ea25c8a8c6a6e56bb40eacefa2/k_diffusion/sampling.py#L17
- def _compute_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.FloatTensor:
+ # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler._compute_karras_sigmas
+ def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.FloatTensor:
"""Constructs the noise schedule of Karras et al. (2022)."""
-
sigma_min = sigma_min or self.config.sigma_min
sigma_max = sigma_max or self.config.sigma_max
@@ -273,6 +285,18 @@ def _compute_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.FloatTe
min_inv_rho = sigma_min ** (1 / rho)
max_inv_rho = sigma_max ** (1 / rho)
sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
+
+ return sigmas
+
+ # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler._compute_exponential_sigmas
+ def _compute_exponential_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.FloatTensor:
+ """Implementation closely follows k-diffusion.
+
+ https://github.com/crowsonkb/k-diffusion/blob/6ab5146d4a5ef63901326489f31f1d8e7dd36b48/k_diffusion/sampling.py#L26
+ """
+ sigma_min = sigma_min or self.config.sigma_min
+ sigma_max = sigma_max or self.config.sigma_max
+ sigmas = torch.linspace(math.log(sigma_min), math.log(sigma_max), len(ramp)).exp().flip(0)
return sigmas
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
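The two `_compute_*_sigmas` helpers differ only in how they map the unit `ramp` onto noise levels: the Karras schedule interpolates in `sigma ** (1 / rho)` space, while the exponential schedule is evenly spaced in `log(sigma)`. A self-contained sketch that mirrors the code above (defaults taken from the scheduler config; it does not import diffusers):

    import math

    import torch

    def karras_sigmas(ramp, sigma_min=0.002, sigma_max=80.0, rho=7.0):
        # interpolate between sigma_max and sigma_min in sigma ** (1 / rho) space
        min_inv_rho = sigma_min ** (1 / rho)
        max_inv_rho = sigma_max ** (1 / rho)
        return (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho

    def exponential_sigmas(ramp, sigma_min=0.002, sigma_max=80.0):
        # evenly spaced in log(sigma), flipped so sigmas decrease along the ramp
        return torch.linspace(math.log(sigma_min), math.log(sigma_max), len(ramp)).exp().flip(0)

    ramp = torch.linspace(0, 1, 5)
    print(karras_sigmas(ramp))       # starts at sigma_max (80.0) and decays to sigma_min (0.002)
    print(exponential_sigmas(ramp))  # same endpoints, but evenly spaced in log-sigma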
diff --git a/src/diffusers/schedulers/scheduling_edm_euler.py b/src/diffusers/schedulers/scheduling_edm_euler.py
index f6a09ca1ee16..0ef9263c9e30 100644
--- a/src/diffusers/schedulers/scheduling_edm_euler.py
+++ b/src/diffusers/schedulers/scheduling_edm_euler.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union
@@ -65,6 +66,10 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin):
range is [0.2, 80.0].
sigma_data (`float`, *optional*, defaults to 0.5):
The standard deviation of the data distribution. This is set to 0.5 in the EDM paper [1].
+ sigma_schedule (`str`, *optional*, defaults to `karras`):
+ Sigma schedule to compute the `sigmas`. By default, we use the schedule introduced in the EDM paper
+ (https://arxiv.org/abs/2206.00364). The other accepted value is "exponential"; the exponential schedule
+ was incorporated in this model: https://huggingface.co/stabilityai/cosxl.
num_train_timesteps (`int`, defaults to 1000):
The number of diffusion steps to train the model.
prediction_type (`str`, defaults to `epsilon`, *optional*):
@@ -84,15 +89,23 @@ def __init__(
sigma_min: float = 0.002,
sigma_max: float = 80.0,
sigma_data: float = 0.5,
+ sigma_schedule: str = "karras",
num_train_timesteps: int = 1000,
prediction_type: str = "epsilon",
rho: float = 7.0,
):
+ if sigma_schedule not in ["karras", "exponential"]:
+ raise ValueError(f"Wrong value for provided for `{sigma_schedule=}`.`")
+
# setable values
self.num_inference_steps = None
ramp = torch.linspace(0, 1, num_train_timesteps)
- sigmas = self._compute_sigmas(ramp)
+ if sigma_schedule == "karras":
+ sigmas = self._compute_karras_sigmas(ramp)
+ elif sigma_schedule == "exponential":
+ sigmas = self._compute_exponential_sigmas(ramp)
+
self.timesteps = self.precondition_noise(sigmas)
self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)])
@@ -200,7 +213,10 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic
self.num_inference_steps = num_inference_steps
ramp = np.linspace(0, 1, self.num_inference_steps)
- sigmas = self._compute_sigmas(ramp)
+ if self.config.sigma_schedule == "karras":
+ sigmas = self._compute_karras_sigmas(ramp)
+ elif self.config.sigma_schedule == "exponential":
+ sigmas = self._compute_exponential_sigmas(ramp)
sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device)
self.timesteps = self.precondition_noise(sigmas)
@@ -211,9 +227,8 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic
self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication
# Taken from https://github.com/crowsonkb/k-diffusion/blob/686dbad0f39640ea25c8a8c6a6e56bb40eacefa2/k_diffusion/sampling.py#L17
- def _compute_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.FloatTensor:
+ def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.FloatTensor:
"""Constructs the noise schedule of Karras et al. (2022)."""
-
sigma_min = sigma_min or self.config.sigma_min
sigma_max = sigma_max or self.config.sigma_max
@@ -221,6 +236,17 @@ def _compute_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.FloatTe
min_inv_rho = sigma_min ** (1 / rho)
max_inv_rho = sigma_max ** (1 / rho)
sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
+
+ return sigmas
+
+ def _compute_exponential_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.FloatTensor:
+ """Implementation closely follows k-diffusion.
+
+ https://github.com/crowsonkb/k-diffusion/blob/6ab5146d4a5ef63901326489f31f1d8e7dd36b48/k_diffusion/sampling.py#L26
+ """
+ sigma_min = sigma_min or self.config.sigma_min
+ sigma_max = sigma_max or self.config.sigma_max
+ sigmas = torch.linspace(math.log(sigma_min), math.log(sigma_max), len(ramp)).exp().flip(0)
return sigmas
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
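From the user side, the schedule should become selectable through the scheduler config once this lands. A hedged usage sketch, assuming `EDMEulerScheduler` is exported at the top level of diffusers and accepts the `sigma_schedule` argument introduced here:

    from diffusers import EDMEulerScheduler

    scheduler = EDMEulerScheduler(sigma_schedule="exponential")  # default is "karras"
    scheduler.set_timesteps(num_inference_steps=25)
    print(scheduler.sigmas[:3], scheduler.sigmas[-3:])  # decreasing sigmas, padded with a trailing zero

In a pipeline the same override would typically go through `EDMEulerScheduler.from_config(pipe.scheduler.config, sigma_schedule="exponential")`.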
diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py
index 1e3252c0bd39..be5bbc235878 100644
--- a/src/diffusers/schedulers/scheduling_euler_discrete.py
+++ b/src/diffusers/schedulers/scheduling_euler_discrete.py
@@ -576,5 +576,44 @@ def add_noise(
noisy_samples = original_samples + noise * sigma
return noisy_samples
+ def get_velocity(
+ self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.FloatTensor
+ ) -> torch.FloatTensor:
+ if (
+ isinstance(timesteps, int)
+ or isinstance(timesteps, torch.IntTensor)
+ or isinstance(timesteps, torch.LongTensor)
+ ):
+ raise ValueError(
+ (
+ "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
+ " `EulerDiscreteScheduler.get_velocity()` is not supported. Make sure to pass"
+ " one of the `scheduler.timesteps` as a timestep."
+ ),
+ )
+
+ if sample.device.type == "mps" and torch.is_floating_point(timesteps):
+ # mps does not support float64
+ schedule_timesteps = self.timesteps.to(sample.device, dtype=torch.float32)
+ timesteps = timesteps.to(sample.device, dtype=torch.float32)
+ else:
+ schedule_timesteps = self.timesteps.to(sample.device)
+ timesteps = timesteps.to(sample.device)
+
+ step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps]
+ alphas_cumprod = self.alphas_cumprod.to(sample)
+ sqrt_alpha_prod = alphas_cumprod[step_indices] ** 0.5
+ sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+ while len(sqrt_alpha_prod.shape) < len(sample.shape):
+ sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
+
+ sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[step_indices]) ** 0.5
+ sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+ while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape):
+ sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
+
+ velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
+ return velocity
+
def __len__(self):
return self.config.num_train_timesteps
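The new `get_velocity` mirrors `DDPMScheduler.get_velocity`: it computes the v-prediction target of Salimans & Ho, `v = sqrt(alpha_bar_t) * noise - sqrt(1 - alpha_bar_t) * sample`, after broadcasting the per-timestep scalars over the sample dimensions. A standalone sketch of just that formula (a hypothetical helper, not the scheduler API):

    import torch

    def velocity_target(sample, noise, alphas_cumprod, step_indices):
        # gather per-timestep scalars and broadcast them over the sample dimensions
        sqrt_alpha = alphas_cumprod[step_indices] ** 0.5
        sqrt_one_minus_alpha = (1 - alphas_cumprod[step_indices]) ** 0.5
        while sqrt_alpha.dim() < sample.dim():
            sqrt_alpha = sqrt_alpha.unsqueeze(-1)
            sqrt_one_minus_alpha = sqrt_one_minus_alpha.unsqueeze(-1)
        return sqrt_alpha * noise - sqrt_one_minus_alpha * sample

    alphas_cumprod = torch.linspace(0.9999, 0.0001, 1000)  # placeholder schedule
    sample = torch.randn(2, 4, 8, 8)
    noise = torch.randn_like(sample)
    v = velocity_target(sample, noise, alphas_cumprod, torch.tensor([10, 500]))
    print(v.shape)  # torch.Size([2, 4, 8, 8])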
diff --git a/src/diffusers/schedulers/scheduling_utils.py b/src/diffusers/schedulers/scheduling_utils.py
index 5dbdb82884bc..dcdce6c51f05 100644
--- a/src/diffusers/schedulers/scheduling_utils.py
+++ b/src/diffusers/schedulers/scheduling_utils.py
@@ -112,9 +112,9 @@ def from_pretrained(
force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
- incompletely downloaded files are deleted.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
diff --git a/src/diffusers/schedulers/scheduling_utils_flax.py b/src/diffusers/schedulers/scheduling_utils_flax.py
index a1d471f910e5..360ca4705e02 100644
--- a/src/diffusers/schedulers/scheduling_utils_flax.py
+++ b/src/diffusers/schedulers/scheduling_utils_flax.py
@@ -102,9 +102,9 @@ def from_pretrained(
force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to delete incompletely received files. Will attempt to resume the download if such a
- file exists.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
diff --git a/src/diffusers/training_utils.py b/src/diffusers/training_utils.py
index 25e02a3d1492..b617dd2eef39 100644
--- a/src/diffusers/training_utils.py
+++ b/src/diffusers/training_utils.py
@@ -1,12 +1,13 @@
import contextlib
import copy
import random
-from typing import Any, Dict, Iterable, List, Optional, Union
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
import numpy as np
import torch
from .models import UNet2DConditionModel
+from .schedulers import SchedulerMixin
from .utils import (
convert_state_dict_to_diffusers,
convert_state_dict_to_peft,
@@ -117,6 +118,60 @@ def resolve_interpolation_mode(interpolation_type: str):
return interpolation_mode
+def compute_dream_and_update_latents(
+ unet: UNet2DConditionModel,
+ noise_scheduler: SchedulerMixin,
+ timesteps: torch.Tensor,
+ noise: torch.Tensor,
+ noisy_latents: torch.Tensor,
+ target: torch.Tensor,
+ encoder_hidden_states: torch.Tensor,
+ dream_detail_preservation: float = 1.0,
+) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
+ """
+ Implements "DREAM (Diffusion Rectification and Estimation-Adaptive Models)" from http://arxiv.org/abs/2312.00210.
+ DREAM aligns training with sampling, making training more efficient and accurate, at the cost of an extra
+ forward pass without gradients.
+
+ Args:
+ `unet`: The UNet to use to make the prediction.
+ `noise_scheduler`: The noise scheduler used to add noise for the given timestep.
+ `timesteps`: The timesteps for the noise_scheduler to use.
+ `noise`: A tensor of noise in the shape of noisy_latents.
+ `noisy_latents`: Previously noised latents from the training loop.
+ `target`: The ground-truth tensor to predict after eps is removed.
+ `encoder_hidden_states`: Text embeddings from the text model.
+ `dream_detail_preservation`: The detail preservation exponent `p` from the paper
+ (`lambda = sqrt(1 - alphas_cumprod) ** p`); the paper uses `p = 1`.
+
+ Returns:
+ `tuple[torch.Tensor, torch.Tensor]`: Adjusted noisy_latents and target.
+ """
+ alphas_cumprod = noise_scheduler.alphas_cumprod.to(timesteps.device)[timesteps, None, None, None]
+ sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod) ** 0.5
+
+ # The paper uses lambda = sqrt(1 - alpha) ** p, with p = 1 in their experiments.
+ dream_lambda = sqrt_one_minus_alphas_cumprod**dream_detail_preservation
+
+ with torch.no_grad():
+ pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
+
+ if noise_scheduler.config.prediction_type == "epsilon":
+ predicted_noise = pred
+ delta_noise = (noise - predicted_noise).detach()
+ delta_noise.mul_(dream_lambda)
+ noisy_latents = noisy_latents.add(sqrt_one_minus_alphas_cumprod * delta_noise)
+ target = target.add(delta_noise)
+ elif noise_scheduler.config.prediction_type == "v_prediction":
+ raise NotImplementedError("DREAM has not been implemented for v-prediction")
+ else:
+ raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+ return noisy_latents, target
+
+
def unet_lora_state_dict(unet: UNet2DConditionModel) -> Dict[str, torch.Tensor]:
r"""
Returns:
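To see where `compute_dream_and_update_latents` slots into a training loop, here is a hedged, self-contained sketch of an epsilon-prediction step using a deliberately tiny placeholder UNet (the config and shapes below are illustrative, not taken from this PR):

    import torch
    from diffusers import DDPMScheduler, UNet2DConditionModel
    from diffusers.training_utils import compute_dream_and_update_latents

    noise_scheduler = DDPMScheduler(num_train_timesteps=1000, prediction_type="epsilon")
    unet = UNet2DConditionModel(  # tiny toy config; real training would load a pretrained UNet
        block_out_channels=(32, 64), layers_per_block=1, sample_size=8, in_channels=4, out_channels=4,
        down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
        up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), cross_attention_dim=32,
    )

    latents = torch.randn(2, 4, 8, 8)
    encoder_hidden_states = torch.randn(2, 77, 32)
    noise = torch.randn_like(latents)
    timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (2,))
    noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

    # DREAM adjusts both the model input and the target before the loss is computed.
    noisy_latents, target = compute_dream_and_update_latents(
        unet, noise_scheduler, timesteps, noise, noisy_latents, noise, encoder_hidden_states,
        dream_detail_preservation=1.0,
    )
    model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
    loss = torch.nn.functional.mse_loss(model_pred.float(), target.float())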
diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
index b750b60491a4..0583cf839ff7 100644
--- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py
+++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
@@ -92,6 +92,21 @@ def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch", "transformers"])
+class AnimateDiffSDXLPipeline(metaclass=DummyObject):
+ _backends = ["torch", "transformers"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch", "transformers"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "transformers"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "transformers"])
+
+
class AnimateDiffVideoToVideoPipeline(metaclass=DummyObject):
_backends = ["torch", "transformers"]
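The dummy class exists so that `from diffusers import AnimateDiffSDXLPipeline` still succeeds in an environment without `torch`/`transformers`; failure is deferred to the point of use. A hedged sketch of that behavior (the repo id is a placeholder):

    # Only meaningful in an environment where torch or transformers is missing:
    from diffusers import AnimateDiffSDXLPipeline  # resolves to the dummy class above

    try:
        AnimateDiffSDXLPipeline.from_pretrained("some-org/some-animatediff-sdxl-repo")
    except ImportError as err:
        # `requires_backends` raises and lists the backends that need to be installed
        print(err)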
diff --git a/src/diffusers/utils/dynamic_modules_utils.py b/src/diffusers/utils/dynamic_modules_utils.py
index add95812122c..8df418f3fdd4 100644
--- a/src/diffusers/utils/dynamic_modules_utils.py
+++ b/src/diffusers/utils/dynamic_modules_utils.py
@@ -201,7 +201,7 @@ def get_cached_module_file(
module_file: str,
cache_dir: Optional[Union[str, os.PathLike]] = None,
force_download: bool = False,
- resume_download: bool = False,
+ resume_download: Optional[bool] = None,
proxies: Optional[Dict[str, str]] = None,
token: Optional[Union[bool, str]] = None,
revision: Optional[str] = None,
@@ -228,9 +228,9 @@ def get_cached_module_file(
cache should not be used.
force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force to (re-)download the configuration files and override the cached versions if they
- exist.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists.
+ exist.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
@@ -380,7 +380,7 @@ def get_class_from_dynamic_module(
class_name: Optional[str] = None,
cache_dir: Optional[Union[str, os.PathLike]] = None,
force_download: bool = False,
- resume_download: bool = False,
+ resume_download: Optional[bool] = None,
proxies: Optional[Dict[str, str]] = None,
token: Optional[Union[bool, str]] = None,
revision: Optional[str] = None,
@@ -417,8 +417,9 @@ def get_class_from_dynamic_module(
force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force to (re-)download the configuration files and override the cached versions if they
exist.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 of
+ Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
diff --git a/src/diffusers/utils/hub_utils.py b/src/diffusers/utils/hub_utils.py
index d70ee53aaa41..83f02848fcf4 100644
--- a/src/diffusers/utils/hub_utils.py
+++ b/src/diffusers/utils/hub_utils.py
@@ -283,7 +283,7 @@ def _get_model_file(
cache_dir: Optional[str] = None,
force_download: bool = False,
proxies: Optional[Dict] = None,
- resume_download: bool = False,
+ resume_download: Optional[bool] = None,
local_files_only: bool = False,
token: Optional[str] = None,
user_agent: Optional[Union[Dict, str]] = None,
diff --git a/src/diffusers/utils/peft_utils.py b/src/diffusers/utils/peft_utils.py
index 8ea12e2e3b3f..ca55192ff7ae 100644
--- a/src/diffusers/utils/peft_utils.py
+++ b/src/diffusers/utils/peft_utils.py
@@ -246,7 +246,13 @@ def get_module_weight(weight_for_adapter, module_name):
for layer_name, weight_ in weight_for_adapter.items():
if layer_name in module_name:
return weight_
- raise RuntimeError(f"No LoRA weight found for module {module_name}.")
+
+ parts = module_name.split(".")
+ # e.g. key = "down_blocks.1.attentions.0"
+ key = f"{parts[0]}.{parts[1]}.attentions.{parts[3]}"
+ block_weight = weight_for_adapter.get(key, 1.0)
+
+ return block_weight
# iterate over each adapter, make it active and set the corresponding scaling weight
for adapter_name, weight in zip(adapter_names, weights):
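With the change above, a LoRA module whose exact name has no entry in the per-layer weight dict falls back to the weight stored for its parent attention block, defaulting to 1.0. A small standalone sketch of the key derivation (the module names below are hypothetical examples):

    def parent_attention_block_key(module_name: str) -> str:
        # "down_blocks.1.attentions.0.transformer_blocks.0.attn1.to_q" -> "down_blocks.1.attentions.0"
        parts = module_name.split(".")
        return f"{parts[0]}.{parts[1]}.attentions.{parts[3]}"

    weight_for_adapter = {"down_blocks.1.attentions.0": 0.6}
    name = "down_blocks.1.attentions.0.transformer_blocks.0.attn1.to_q"
    print(weight_for_adapter.get(parent_attention_block_key(name), 1.0))                                   # 0.6
    print(weight_for_adapter.get(parent_attention_block_key("up_blocks.0.attentions.1.attn2.to_k"), 1.0))  # 1.0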
diff --git a/tests/lora/test_lora_layers_sdxl.py b/tests/lora/test_lora_layers_sdxl.py
index b46b887d10fb..a8b2d2759f41 100644
--- a/tests/lora/test_lora_layers_sdxl.py
+++ b/tests/lora/test_lora_layers_sdxl.py
@@ -202,6 +202,36 @@ def test_sdxl_1_0_lora(self):
pipe.unload_lora_weights()
release_memory(pipe)
+ def test_sdxl_1_0_blockwise_lora(self):
+ generator = torch.Generator("cpu").manual_seed(0)
+
+ pipe = StableDiffusionXLPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")
+ pipe.enable_model_cpu_offload()
+ lora_model_id = "hf-internal-testing/sdxl-1.0-lora"
+ lora_filename = "sd_xl_offset_example-lora_1.0.safetensors"
+ pipe.load_lora_weights(lora_model_id, weight_name=lora_filename, adapter_name="offset")
+ scales = {
+ "unet": {
+ "down": {"block_1": [1.0, 1.0], "block_2": [1.0, 1.0]},
+ "mid": 1.0,
+ "up": {"block_0": [1.0, 1.0, 1.0], "block_1": [1.0, 1.0, 1.0]},
+ },
+ }
+ pipe.set_adapters(["offset"], [scales])
+
+ images = pipe(
+ "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
+ ).images
+
+ images = images[0, -3:, -3:, -1].flatten()
+ expected = np.array([0.4468, 0.4087, 0.4134, 0.366, 0.3202, 0.3505, 0.3786, 0.387, 0.3535])
+
+ max_diff = numpy_cosine_similarity_distance(expected, images)
+ assert max_diff < 1e-4
+
+ pipe.unload_lora_weights()
+ release_memory(pipe)
+
def test_sdxl_lcm_lora(self):
pipe = StableDiffusionXLPipeline.from_pretrained(
"stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py
index f919ba10fbb7..59369b509876 100644
--- a/tests/models/test_modeling_common.py
+++ b/tests/models/test_modeling_common.py
@@ -24,14 +24,20 @@
import numpy as np
import requests_mock
import torch
+from accelerate.utils import compute_module_sizes
from huggingface_hub import ModelCard, delete_repo
from huggingface_hub.utils import is_jinja_available
from requests.exceptions import HTTPError
from diffusers.models import UNet2DConditionModel
-from diffusers.models.attention_processor import AttnProcessor, AttnProcessor2_0, XFormersAttnProcessor
+from diffusers.models.attention_processor import (
+ AttnProcessor,
+ AttnProcessor2_0,
+ AttnProcessorNPU,
+ XFormersAttnProcessor,
+)
from diffusers.training_utils import EMAModel
-from diffusers.utils import is_xformers_available, logging
+from diffusers.utils import is_torch_npu_available, is_xformers_available, logging
from diffusers.utils.testing_utils import (
CaptureLogger,
get_python_version,
@@ -39,6 +45,7 @@
require_torch_2,
require_torch_accelerator_with_training,
require_torch_gpu,
+ require_torch_multi_gpu,
run_test_in_subprocess,
torch_device,
)
@@ -200,6 +207,21 @@ class ModelTesterMixin:
main_input_name = None # overwrite in model specific tester class
base_precision = 1e-3
forward_requires_fresh_args = False
+ model_split_percents = [0.5, 0.7, 0.9]
+
+ def check_device_map_is_respected(self, model, device_map):
+ for param_name, param in model.named_parameters():
+ # Find device in device_map
+ while len(param_name) > 0 and param_name not in device_map:
+ param_name = ".".join(param_name.split(".")[:-1])
+ if param_name not in device_map:
+ raise ValueError("device map is incomplete, it does not contain any device for `param_name`.")
+
+ param_device = device_map[param_name]
+ if param_device in ["cpu", "disk"]:
+ self.assertEqual(param.device, torch.device("meta"))
+ else:
+ self.assertEqual(param.device, torch.device(param_device))
def test_from_save_pretrained(self, expected_max_diff=5e-5):
if self.forward_requires_fresh_args:
@@ -283,6 +305,53 @@ def test_getattr_is_correct(self):
assert str(error.exception) == f"'{type(model).__name__}' object has no attribute 'does_not_exist'"
+ @unittest.skipIf(
+ torch_device != "npu" or not is_torch_npu_available(),
+ reason="torch npu flash attention is only available with NPU and `torch_npu` installed",
+ )
+ def test_set_torch_npu_flash_attn_processor_determinism(self):
+ torch.use_deterministic_algorithms(False)
+ if self.forward_requires_fresh_args:
+ model = self.model_class(**self.init_dict)
+ else:
+ init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+ model = self.model_class(**init_dict)
+ model.to(torch_device)
+
+ if not hasattr(model, "set_attn_processor"):
+ # If the model does not have `set_attn_processor`, skip the test
+ return
+
+ model.set_default_attn_processor()
+ assert all(type(proc) == AttnProcessorNPU for proc in model.attn_processors.values())
+ with torch.no_grad():
+ if self.forward_requires_fresh_args:
+ output = model(**self.inputs_dict(0))[0]
+ else:
+ output = model(**inputs_dict)[0]
+
+ model.enable_npu_flash_attention()
+ assert all(type(proc) == AttnProcessorNPU for proc in model.attn_processors.values())
+ with torch.no_grad():
+ if self.forward_requires_fresh_args:
+ output_2 = model(**self.inputs_dict(0))[0]
+ else:
+ output_2 = model(**inputs_dict)[0]
+
+ model.set_attn_processor(AttnProcessorNPU())
+ assert all(type(proc) == AttnProcessorNPU for proc in model.attn_processors.values())
+ with torch.no_grad():
+ if self.forward_requires_fresh_args:
+ output_3 = model(**self.inputs_dict(0))[0]
+ else:
+ output_3 = model(**inputs_dict)[0]
+
+ torch.use_deterministic_algorithms(True)
+
+ assert torch.allclose(output, output_2, atol=self.base_precision)
+ assert torch.allclose(output, output_3, atol=self.base_precision)
+ assert torch.allclose(output_2, output_3, atol=self.base_precision)
+
@unittest.skipIf(
torch_device != "cuda" or not is_xformers_available(),
reason="XFormers attention is only available with CUDA and `xformers` installed",
@@ -670,6 +739,129 @@ def test_deprecated_kwargs(self):
" from `_deprecated_kwargs = []`"
)
+ @require_torch_gpu
+ def test_cpu_offload(self):
+ config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+ model = self.model_class(**config).eval()
+ if model._no_split_modules is None:
+ return
+
+ model = model.to(torch_device)
+
+ torch.manual_seed(0)
+ base_output = model(**inputs_dict)
+
+ model_size = compute_module_sizes(model)[""]
+ # We test several splits of sizes to make sure it works.
+ max_gpu_sizes = [int(p * model_size) for p in self.model_split_percents[1:]]
+ with tempfile.TemporaryDirectory() as tmp_dir:
+ model.cpu().save_pretrained(tmp_dir)
+
+ for max_size in max_gpu_sizes:
+ max_memory = {0: max_size, "cpu": model_size * 2}
+ new_model = self.model_class.from_pretrained(tmp_dir, device_map="auto", max_memory=max_memory)
+ # Making sure part of the model will actually end up offloaded
+ self.assertSetEqual(set(new_model.hf_device_map.values()), {0, "cpu"})
+
+ self.check_device_map_is_respected(new_model, new_model.hf_device_map)
+ torch.manual_seed(0)
+ new_output = new_model(**inputs_dict)
+
+ self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5))
+
+ @require_torch_gpu
+ def test_disk_offload_without_safetensors(self):
+ config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+ model = self.model_class(**config).eval()
+ if model._no_split_modules is None:
+ return
+
+ model = model.to(torch_device)
+
+ torch.manual_seed(0)
+ base_output = model(**inputs_dict)
+
+ model_size = compute_module_sizes(model)[""]
+ with tempfile.TemporaryDirectory() as tmp_dir:
+ model.cpu().save_pretrained(tmp_dir, safe_serialization=False)
+
+ with self.assertRaises(ValueError):
+ max_size = int(self.model_split_percents[0] * model_size)
+ max_memory = {0: max_size, "cpu": max_size}
+ # This errors out because it's missing an offload folder
+ new_model = self.model_class.from_pretrained(tmp_dir, device_map="auto", max_memory=max_memory)
+
+ max_size = int(self.model_split_percents[0] * model_size)
+ max_memory = {0: max_size, "cpu": max_size}
+ new_model = self.model_class.from_pretrained(
+ tmp_dir, device_map="auto", max_memory=max_memory, offload_folder=tmp_dir
+ )
+
+ self.check_device_map_is_respected(new_model, new_model.hf_device_map)
+ torch.manual_seed(0)
+ new_output = new_model(**inputs_dict)
+
+ self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5))
+
+ @require_torch_gpu
+ def test_disk_offload_with_safetensors(self):
+ config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+ model = self.model_class(**config).eval()
+ if model._no_split_modules is None:
+ return
+
+ model = model.to(torch_device)
+
+ torch.manual_seed(0)
+ base_output = model(**inputs_dict)
+
+ model_size = compute_module_sizes(model)[""]
+ with tempfile.TemporaryDirectory() as tmp_dir:
+ model.cpu().save_pretrained(tmp_dir)
+
+ max_size = int(self.model_split_percents[0] * model_size)
+ max_memory = {0: max_size, "cpu": max_size}
+ new_model = self.model_class.from_pretrained(
+ tmp_dir, device_map="auto", offload_folder=tmp_dir, max_memory=max_memory
+ )
+
+ self.check_device_map_is_respected(new_model, new_model.hf_device_map)
+ torch.manual_seed(0)
+ new_output = new_model(**inputs_dict)
+
+ self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5))
+
+ @require_torch_multi_gpu
+ def test_model_parallelism(self):
+ config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+ model = self.model_class(**config).eval()
+ if model._no_split_modules is None:
+ return
+
+ model = model.to(torch_device)
+
+ torch.manual_seed(0)
+ base_output = model(**inputs_dict)
+
+ model_size = compute_module_sizes(model)[""]
+ # We test several splits of sizes to make sure it works.
+ max_gpu_sizes = [int(p * model_size) for p in self.model_split_percents[1:]]
+ with tempfile.TemporaryDirectory() as tmp_dir:
+ model.cpu().save_pretrained(tmp_dir)
+
+ for max_size in max_gpu_sizes:
+ max_memory = {0: max_size, 1: model_size * 2, "cpu": model_size * 2}
+ new_model = self.model_class.from_pretrained(tmp_dir, device_map="auto", max_memory=max_memory)
+ # Making sure part of the model will actually end up offloaded
+ self.assertSetEqual(set(new_model.hf_device_map.values()), {0, 1})
+
+ self.check_device_map_is_respected(new_model, new_model.hf_device_map)
+
+ torch.manual_seed(0)
+ new_output = new_model(**inputs_dict)
+
+ self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5))
+
@is_staging_test
class ModelPushToHubTester(unittest.TestCase):
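Outside the test suite, the sharded-loading path these new tests exercise looks roughly like the sketch below. It assumes a diffusers build where `ModelMixin.from_pretrained` accepts `device_map`/`max_memory` (as the tests above do), and the memory budgets are placeholders:

    from diffusers import UNet2DConditionModel

    model = UNet2DConditionModel.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        subfolder="unet",
        device_map="auto",
        max_memory={0: "2GiB", "cpu": "16GiB"},  # cap GPU 0, spill the rest to CPU
    )
    # Accelerate records where each top-level module ended up:
    print(model.hf_device_map)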
diff --git a/tests/models/unets/test_models_unet_2d_condition.py b/tests/models/unets/test_models_unet_2d_condition.py
index 1b8a998cfd66..33aa6a10377b 100644
--- a/tests/models/unets/test_models_unet_2d_condition.py
+++ b/tests/models/unets/test_models_unet_2d_condition.py
@@ -300,6 +300,8 @@ def create_custom_diffusion_layers(model, mock_weights: bool = True):
class UNet2DConditionModelTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase):
model_class = UNet2DConditionModel
main_input_name = "sample"
+ # We override the items here because the unet under consideration is small.
+ model_split_percents = [0.5, 0.3, 0.4]
@property
def dummy_input(self):
diff --git a/tests/models/unets/test_models_unet_controlnetxs.py b/tests/models/unets/test_models_unet_controlnetxs.py
index 8c9b43a20ad6..6f3662e01750 100644
--- a/tests/models/unets/test_models_unet_controlnetxs.py
+++ b/tests/models/unets/test_models_unet_controlnetxs.py
@@ -22,11 +22,7 @@
from diffusers import ControlNetXSAdapter, UNet2DConditionModel, UNetControlNetXSModel
from diffusers.utils import logging
-from diffusers.utils.testing_utils import (
- enable_full_determinism,
- floats_tensor,
- torch_device,
-)
+from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor, is_flaky, torch_device
from ..test_modeling_common import ModelTesterMixin, UNetTesterMixin
@@ -305,6 +301,7 @@ def _set_gradient_checkpointing_new(self, module, value=False):
assert set(modules_with_gc_enabled.keys()) == EXPECTED_SET
assert all(modules_with_gc_enabled.values()), "All modules should be enabled"
+ @is_flaky
def test_forward_no_control(self):
unet = self.get_dummy_unet()
controlnet = self.get_dummy_controlnet_from_unet(unet)
diff --git a/tests/pipelines/amused/test_amused.py b/tests/pipelines/amused/test_amused.py
index f03751e2f830..9a9e2551d642 100644
--- a/tests/pipelines/amused/test_amused.py
+++ b/tests/pipelines/amused/test_amused.py
@@ -38,17 +38,17 @@ class AmusedPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
def get_dummy_components(self):
torch.manual_seed(0)
transformer = UVit2DModel(
- hidden_size=32,
+ hidden_size=8,
use_bias=False,
hidden_dropout=0.0,
- cond_embed_dim=32,
+ cond_embed_dim=8,
micro_cond_encode_dim=2,
micro_cond_embed_dim=10,
- encoder_hidden_size=32,
+ encoder_hidden_size=8,
vocab_size=32,
- codebook_size=32,
- in_channels=32,
- block_out_channels=32,
+ codebook_size=8,
+ in_channels=8,
+ block_out_channels=8,
num_res_blocks=1,
downsample=True,
upsample=True,
@@ -56,7 +56,7 @@ def get_dummy_components(self):
num_hidden_layers=1,
num_attention_heads=1,
attention_dropout=0.0,
- intermediate_size=32,
+ intermediate_size=8,
layer_norm_eps=1e-06,
ln_elementwise_affine=True,
)
@@ -64,17 +64,17 @@ def get_dummy_components(self):
torch.manual_seed(0)
vqvae = VQModel(
act_fn="silu",
- block_out_channels=[32],
+ block_out_channels=[8],
down_block_types=[
"DownEncoderBlock2D",
],
in_channels=3,
- latent_channels=32,
- layers_per_block=2,
- norm_num_groups=32,
- num_vq_embeddings=32,
+ latent_channels=8,
+ layers_per_block=1,
+ norm_num_groups=8,
+ num_vq_embeddings=8,
out_channels=3,
- sample_size=32,
+ sample_size=8,
up_block_types=[
"UpDecoderBlock2D",
],
@@ -85,14 +85,14 @@ def get_dummy_components(self):
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
eos_token_id=2,
- hidden_size=32,
- intermediate_size=64,
+ hidden_size=8,
+ intermediate_size=8,
layer_norm_eps=1e-05,
- num_attention_heads=8,
- num_hidden_layers=3,
+ num_attention_heads=1,
+ num_hidden_layers=1,
pad_token_id=1,
vocab_size=1000,
- projection_dim=32,
+ projection_dim=8,
)
text_encoder = CLIPTextModelWithProjection(text_encoder_config)
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
diff --git a/tests/pipelines/amused/test_amused_img2img.py b/tests/pipelines/amused/test_amused_img2img.py
index efbca1f437a4..24bc34d330e9 100644
--- a/tests/pipelines/amused/test_amused_img2img.py
+++ b/tests/pipelines/amused/test_amused_img2img.py
@@ -42,17 +42,17 @@ class AmusedImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
def get_dummy_components(self):
torch.manual_seed(0)
transformer = UVit2DModel(
- hidden_size=32,
+ hidden_size=8,
use_bias=False,
hidden_dropout=0.0,
- cond_embed_dim=32,
+ cond_embed_dim=8,
micro_cond_encode_dim=2,
micro_cond_embed_dim=10,
- encoder_hidden_size=32,
+ encoder_hidden_size=8,
vocab_size=32,
- codebook_size=32,
- in_channels=32,
- block_out_channels=32,
+ codebook_size=8,
+ in_channels=8,
+ block_out_channels=8,
num_res_blocks=1,
downsample=True,
upsample=True,
@@ -60,7 +60,7 @@ def get_dummy_components(self):
num_hidden_layers=1,
num_attention_heads=1,
attention_dropout=0.0,
- intermediate_size=32,
+ intermediate_size=8,
layer_norm_eps=1e-06,
ln_elementwise_affine=True,
)
@@ -68,17 +68,17 @@ def get_dummy_components(self):
torch.manual_seed(0)
vqvae = VQModel(
act_fn="silu",
- block_out_channels=[32],
+ block_out_channels=[8],
down_block_types=[
"DownEncoderBlock2D",
],
in_channels=3,
- latent_channels=32,
- layers_per_block=2,
- norm_num_groups=32,
- num_vq_embeddings=32,
+ latent_channels=8,
+ layers_per_block=1,
+ norm_num_groups=8,
+ num_vq_embeddings=32, # reducing this to 16 or 8 -> RuntimeError: "cdist_cuda" not implemented for 'Half'
out_channels=3,
- sample_size=32,
+ sample_size=8,
up_block_types=[
"UpDecoderBlock2D",
],
@@ -89,14 +89,14 @@ def get_dummy_components(self):
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
eos_token_id=2,
- hidden_size=32,
- intermediate_size=64,
+ hidden_size=8,
+ intermediate_size=8,
layer_norm_eps=1e-05,
- num_attention_heads=8,
- num_hidden_layers=3,
+ num_attention_heads=1,
+ num_hidden_layers=1,
pad_token_id=1,
vocab_size=1000,
- projection_dim=32,
+ projection_dim=8,
)
text_encoder = CLIPTextModelWithProjection(text_encoder_config)
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
diff --git a/tests/pipelines/amused/test_amused_inpaint.py b/tests/pipelines/amused/test_amused_inpaint.py
index d397f8d81297..d0c1ed09c706 100644
--- a/tests/pipelines/amused/test_amused_inpaint.py
+++ b/tests/pipelines/amused/test_amused_inpaint.py
@@ -42,17 +42,17 @@ class AmusedInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
def get_dummy_components(self):
torch.manual_seed(0)
transformer = UVit2DModel(
- hidden_size=32,
+ hidden_size=8,
use_bias=False,
hidden_dropout=0.0,
- cond_embed_dim=32,
+ cond_embed_dim=8,
micro_cond_encode_dim=2,
micro_cond_embed_dim=10,
- encoder_hidden_size=32,
+ encoder_hidden_size=8,
vocab_size=32,
- codebook_size=32,
- in_channels=32,
- block_out_channels=32,
+ codebook_size=32, # codebook size needs to be consistent with num_vq_embeddings for inpaint tests
+ in_channels=8,
+ block_out_channels=8,
num_res_blocks=1,
downsample=True,
upsample=True,
@@ -60,7 +60,7 @@ def get_dummy_components(self):
num_hidden_layers=1,
num_attention_heads=1,
attention_dropout=0.0,
- intermediate_size=32,
+ intermediate_size=8,
layer_norm_eps=1e-06,
ln_elementwise_affine=True,
)
@@ -68,17 +68,17 @@ def get_dummy_components(self):
torch.manual_seed(0)
vqvae = VQModel(
act_fn="silu",
- block_out_channels=[32],
+ block_out_channels=[8],
down_block_types=[
"DownEncoderBlock2D",
],
in_channels=3,
- latent_channels=32,
- layers_per_block=2,
- norm_num_groups=32,
- num_vq_embeddings=32,
+ latent_channels=8,
+ layers_per_block=1,
+ norm_num_groups=8,
+ num_vq_embeddings=32, # reducing this to 16 or 8 -> RuntimeError: "cdist_cuda" not implemented for 'Half'
out_channels=3,
- sample_size=32,
+ sample_size=8,
up_block_types=[
"UpDecoderBlock2D",
],
@@ -89,14 +89,14 @@ def get_dummy_components(self):
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
eos_token_id=2,
- hidden_size=32,
- intermediate_size=64,
+ hidden_size=8,
+ intermediate_size=8,
layer_norm_eps=1e-05,
- num_attention_heads=8,
- num_hidden_layers=3,
+ num_attention_heads=1,
+ num_hidden_layers=1,
pad_token_id=1,
vocab_size=1000,
- projection_dim=32,
+ projection_dim=8,
)
text_encoder = CLIPTextModelWithProjection(text_encoder_config)
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
diff --git a/tests/pipelines/animatediff/test_animatediff_sdxl.py b/tests/pipelines/animatediff/test_animatediff_sdxl.py
new file mode 100644
index 000000000000..2db0139154e9
--- /dev/null
+++ b/tests/pipelines/animatediff/test_animatediff_sdxl.py
@@ -0,0 +1,307 @@
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
+
+import diffusers
+from diffusers import (
+ AnimateDiffSDXLPipeline,
+ AutoencoderKL,
+ DDIMScheduler,
+ MotionAdapter,
+ UNet2DConditionModel,
+ UNetMotionModel,
+)
+from diffusers.utils import is_xformers_available, logging
+from diffusers.utils.testing_utils import torch_device
+
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import (
+ IPAdapterTesterMixin,
+ PipelineTesterMixin,
+ SDFunctionTesterMixin,
+ SDXLOptionalComponentsTesterMixin,
+)
+
+
+def to_np(tensor):
+ if isinstance(tensor, torch.Tensor):
+ tensor = tensor.detach().cpu().numpy()
+
+ return tensor
+
+
+class AnimateDiffPipelineSDXLFastTests(
+ IPAdapterTesterMixin,
+ SDFunctionTesterMixin,
+ PipelineTesterMixin,
+ SDXLOptionalComponentsTesterMixin,
+ unittest.TestCase,
+):
+ pipeline_class = AnimateDiffSDXLPipeline
+ params = TEXT_TO_IMAGE_PARAMS
+ batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+ required_optional_params = frozenset(
+ [
+ "num_inference_steps",
+ "generator",
+ "latents",
+ "return_dict",
+ "callback_on_step_end",
+ "callback_on_step_end_tensor_inputs",
+ ]
+ )
+ callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS.union({"add_text_embeds", "add_time_ids"})
+
+ def get_dummy_components(self, time_cond_proj_dim=None):
+ torch.manual_seed(0)
+ unet = UNet2DConditionModel(
+ block_out_channels=(32, 64, 128),
+ layers_per_block=2,
+ time_cond_proj_dim=time_cond_proj_dim,
+ sample_size=32,
+ in_channels=4,
+ out_channels=4,
+ down_block_types=("DownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D"),
+ up_block_types=("CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "UpBlock2D"),
+ # SDXL-specific config below
+ attention_head_dim=(2, 4, 8),
+ use_linear_projection=True,
+ addition_embed_type="text_time",
+ addition_time_embed_dim=8,
+ transformer_layers_per_block=(1, 2, 4),
+ projection_class_embeddings_input_dim=80, # 6 * 8 + 32
+ cross_attention_dim=64,
+ norm_num_groups=1,
+ )
+ scheduler = DDIMScheduler(
+ beta_start=0.00085,
+ beta_end=0.012,
+ beta_schedule="linear",
+ clip_sample=False,
+ )
+ torch.manual_seed(0)
+ vae = AutoencoderKL(
+ block_out_channels=[32, 64],
+ in_channels=3,
+ out_channels=3,
+ down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+ up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+ latent_channels=4,
+ sample_size=128,
+ )
+ torch.manual_seed(0)
+ text_encoder_config = CLIPTextConfig(
+ bos_token_id=0,
+ eos_token_id=2,
+ hidden_size=32,
+ intermediate_size=37,
+ layer_norm_eps=1e-05,
+ num_attention_heads=4,
+ num_hidden_layers=5,
+ pad_token_id=1,
+ vocab_size=1000,
+ # SDXL-specific config below
+ hidden_act="gelu",
+ projection_dim=32,
+ )
+ text_encoder = CLIPTextModel(text_encoder_config)
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+ text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config)
+ tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+ motion_adapter = MotionAdapter(
+ block_out_channels=(32, 64, 128),
+ motion_layers_per_block=2,
+ motion_norm_num_groups=2,
+ motion_num_attention_heads=4,
+ use_motion_mid_block=False,
+ )
+
+ components = {
+ "unet": unet,
+ "scheduler": scheduler,
+ "vae": vae,
+ "motion_adapter": motion_adapter,
+ "text_encoder": text_encoder,
+ "tokenizer": tokenizer,
+ "text_encoder_2": text_encoder_2,
+ "tokenizer_2": tokenizer_2,
+ "feature_extractor": None,
+ "image_encoder": None,
+ }
+ return components
+
+ def get_dummy_inputs(self, device, seed=0):
+ if str(device).startswith("mps"):
+ generator = torch.manual_seed(seed)
+ else:
+ generator = torch.Generator(device=device).manual_seed(seed)
+
+ inputs = {
+ "prompt": "A painting of a squirrel eating a burger",
+ "generator": generator,
+ "num_inference_steps": 2,
+ "guidance_scale": 7.5,
+ "output_type": "np",
+ }
+ return inputs
+
+ def test_motion_unet_loading(self):
+ components = self.get_dummy_components()
+ pipe = AnimateDiffSDXLPipeline(**components)
+
+ assert isinstance(pipe.unet, UNetMotionModel)
+
+ @unittest.skip("Attention slicing is not enabled in this pipeline")
+ def test_attention_slicing_forward_pass(self):
+ pass
+
+ def test_inference_batch_single_identical(
+ self,
+ batch_size=2,
+ expected_max_diff=1e-4,
+ additional_params_copy_to_batched_inputs=["num_inference_steps"],
+ ):
+ components = self.get_dummy_components()
+ pipe = self.pipeline_class(**components)
+ for components in pipe.components.values():
+ if hasattr(components, "set_default_attn_processor"):
+ components.set_default_attn_processor()
+
+ pipe.to(torch_device)
+ pipe.set_progress_bar_config(disable=None)
+ inputs = self.get_dummy_inputs(torch_device)
+ # Reset generator in case it has been used in self.get_dummy_inputs
+ inputs["generator"] = self.get_generator(0)
+
+ logger = logging.get_logger(pipe.__module__)
+ logger.setLevel(level=diffusers.logging.FATAL)
+
+ # batchify inputs
+ batched_inputs = {}
+ batched_inputs.update(inputs)
+
+ for name in self.batch_params:
+ if name not in inputs:
+ continue
+
+ value = inputs[name]
+ if name == "prompt":
+ len_prompt = len(value)
+ batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
+ batched_inputs[name][-1] = 100 * "very long"
+
+ else:
+ batched_inputs[name] = batch_size * [value]
+
+ if "generator" in inputs:
+ batched_inputs["generator"] = [self.get_generator(i) for i in range(batch_size)]
+
+ if "batch_size" in inputs:
+ batched_inputs["batch_size"] = batch_size
+
+ for arg in additional_params_copy_to_batched_inputs:
+ batched_inputs[arg] = inputs[arg]
+
+ output = pipe(**inputs)
+ output_batch = pipe(**batched_inputs)
+
+ assert output_batch[0].shape[0] == batch_size
+
+ max_diff = np.abs(to_np(output_batch[0][0]) - to_np(output[0][0])).max()
+ assert max_diff < expected_max_diff
+
+ @unittest.skipIf(torch_device != "cuda", reason="CUDA and CPU are required to switch devices")
+ def test_to_device(self):
+ components = self.get_dummy_components()
+ pipe = self.pipeline_class(**components)
+ pipe.set_progress_bar_config(disable=None)
+
+ pipe.to("cpu")
+ # pipeline creates a new motion UNet under the hood. So we need to check the device from pipe.components
+ model_devices = [
+ component.device.type for component in pipe.components.values() if hasattr(component, "device")
+ ]
+ self.assertTrue(all(device == "cpu" for device in model_devices))
+
+ output_cpu = pipe(**self.get_dummy_inputs("cpu"))[0]
+ self.assertTrue(np.isnan(output_cpu).sum() == 0)
+
+ pipe.to("cuda")
+ model_devices = [
+ component.device.type for component in pipe.components.values() if hasattr(component, "device")
+ ]
+ self.assertTrue(all(device == "cuda" for device in model_devices))
+
+ output_cuda = pipe(**self.get_dummy_inputs("cuda"))[0]
+ self.assertTrue(np.isnan(to_np(output_cuda)).sum() == 0)
+
+ def test_to_dtype(self):
+ components = self.get_dummy_components()
+ pipe = self.pipeline_class(**components)
+ pipe.set_progress_bar_config(disable=None)
+
+ # pipeline creates a new motion UNet under the hood. So we need to check the dtype from pipe.components
+ model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
+ self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes))
+
+ pipe.to(dtype=torch.float16)
+ model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
+ self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes))
+
+ def test_prompt_embeds(self):
+ components = self.get_dummy_components()
+ pipe = self.pipeline_class(**components)
+ pipe.set_progress_bar_config(disable=None)
+ pipe.to(torch_device)
+
+ inputs = self.get_dummy_inputs(torch_device)
+ prompt = inputs.pop("prompt")
+
+ (
+ prompt_embeds,
+ negative_prompt_embeds,
+ pooled_prompt_embeds,
+ negative_pooled_prompt_embeds,
+ ) = pipe.encode_prompt(prompt)
+
+ pipe(
+ **inputs,
+ prompt_embeds=prompt_embeds,
+ negative_prompt_embeds=negative_prompt_embeds,
+ pooled_prompt_embeds=pooled_prompt_embeds,
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+ )
+
+ def test_save_load_optional_components(self):
+ self._test_save_load_optional_components()
+
+ @unittest.skipIf(
+ torch_device != "cuda" or not is_xformers_available(),
+ reason="XFormers attention is only available with CUDA and `xformers` installed",
+ )
+ def test_xformers_attention_forwardGenerator_pass(self):
+ components = self.get_dummy_components()
+ pipe = self.pipeline_class(**components)
+ for component in pipe.components.values():
+ if hasattr(component, "set_default_attn_processor"):
+ component.set_default_attn_processor()
+ pipe.to(torch_device)
+ pipe.set_progress_bar_config(disable=None)
+
+ inputs = self.get_dummy_inputs(torch_device)
+ output_without_offload = pipe(**inputs).frames[0]
+ output_without_offload = (
+ output_without_offload.cpu() if torch.is_tensor(output_without_offload) else output_without_offload
+ )
+
+ pipe.enable_xformers_memory_efficient_attention()
+ inputs = self.get_dummy_inputs(torch_device)
+ output_with_offload = pipe(**inputs).frames[0]
+ output_with_offload = (
+ output_with_offload.cpu() if torch.is_tensor(output_with_offload) else output_with_offload
+ )
+
+ max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max()
+ self.assertLess(max_diff, 1e-4, "XFormers attention should not affect the inference results")
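For reference, end-user inference with the pipeline this new test file covers would look roughly like the sketch below; the motion-adapter repo id is a placeholder and the scheduler settings are illustrative:

    import torch
    from diffusers import AnimateDiffSDXLPipeline, DDIMScheduler, MotionAdapter
    from diffusers.utils import export_to_gif

    adapter = MotionAdapter.from_pretrained(
        "your-org/animatediff-motion-adapter-sdxl", torch_dtype=torch.float16  # placeholder repo id
    )
    pipe = AnimateDiffSDXLPipeline.from_pretrained(
        "stabilityai/stable-diffusion-xl-base-1.0", motion_adapter=adapter, torch_dtype=torch.float16
    ).to("cuda")
    pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config, clip_sample=False)

    frames = pipe("a panda surfing at sunset", num_frames=16, num_inference_steps=25).frames[0]
    export_to_gif(frames, "panda.gif")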
diff --git a/tests/pipelines/audioldm/test_audioldm.py b/tests/pipelines/audioldm/test_audioldm.py
index f83dc8158e83..eddab54a3c03 100644
--- a/tests/pipelines/audioldm/test_audioldm.py
+++ b/tests/pipelines/audioldm/test_audioldm.py
@@ -66,16 +66,17 @@ class AudioLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
def get_dummy_components(self):
torch.manual_seed(0)
unet = UNet2DConditionModel(
- block_out_channels=(32, 64),
- layers_per_block=2,
+ block_out_channels=(8, 16),
+ layers_per_block=1,
+ norm_num_groups=8,
sample_size=32,
in_channels=4,
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
- cross_attention_dim=(32, 64),
+ cross_attention_dim=(8, 16),
class_embed_type="simple_projection",
- projection_class_embeddings_input_dim=32,
+ projection_class_embeddings_input_dim=8,
class_embeddings_concat=True,
)
scheduler = DDIMScheduler(
@@ -87,9 +88,10 @@ def get_dummy_components(self):
)
torch.manual_seed(0)
vae = AutoencoderKL(
- block_out_channels=[32, 64],
+ block_out_channels=[8, 16],
in_channels=1,
out_channels=1,
+ norm_num_groups=8,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
latent_channels=4,
@@ -98,14 +100,14 @@ def get_dummy_components(self):
text_encoder_config = ClapTextConfig(
bos_token_id=0,
eos_token_id=2,
- hidden_size=32,
+ hidden_size=8,
intermediate_size=37,
layer_norm_eps=1e-05,
- num_attention_heads=4,
- num_hidden_layers=5,
+ num_attention_heads=1,
+ num_hidden_layers=1,
pad_token_id=1,
vocab_size=1000,
- projection_dim=32,
+ projection_dim=8,
)
text_encoder = ClapTextModelWithProjection(text_encoder_config)
tokenizer = RobertaTokenizer.from_pretrained("hf-internal-testing/tiny-random-roberta", model_max_length=77)
diff --git a/tests/pipelines/blipdiffusion/test_blipdiffusion.py b/tests/pipelines/blipdiffusion/test_blipdiffusion.py
index c5eaa3883d09..7e85cef65129 100644
--- a/tests/pipelines/blipdiffusion/test_blipdiffusion.py
+++ b/tests/pipelines/blipdiffusion/test_blipdiffusion.py
@@ -64,9 +64,9 @@ def get_dummy_components(self):
torch.manual_seed(0)
text_encoder_config = CLIPTextConfig(
vocab_size=1000,
- hidden_size=16,
- intermediate_size=16,
- projection_dim=16,
+ hidden_size=8,
+ intermediate_size=8,
+ projection_dim=8,
num_hidden_layers=1,
num_attention_heads=1,
max_position_embeddings=77,
@@ -78,17 +78,17 @@ def get_dummy_components(self):
out_channels=4,
down_block_types=("DownEncoderBlock2D",),
up_block_types=("UpDecoderBlock2D",),
- block_out_channels=(32,),
+ block_out_channels=(8,),
+ norm_num_groups=8,
layers_per_block=1,
act_fn="silu",
latent_channels=4,
- norm_num_groups=16,
- sample_size=16,
+ sample_size=8,
)
blip_vision_config = {
- "hidden_size": 16,
- "intermediate_size": 16,
+ "hidden_size": 8,
+ "intermediate_size": 8,
"num_hidden_layers": 1,
"num_attention_heads": 1,
"image_size": 224,
@@ -98,32 +98,32 @@ def get_dummy_components(self):
blip_qformer_config = {
"vocab_size": 1000,
- "hidden_size": 16,
+ "hidden_size": 8,
"num_hidden_layers": 1,
"num_attention_heads": 1,
- "intermediate_size": 16,
+ "intermediate_size": 8,
"max_position_embeddings": 512,
"cross_attention_frequency": 1,
- "encoder_hidden_size": 16,
+ "encoder_hidden_size": 8,
}
qformer_config = Blip2Config(
vision_config=blip_vision_config,
qformer_config=blip_qformer_config,
- num_query_tokens=16,
+ num_query_tokens=8,
tokenizer="hf-internal-testing/tiny-random-bert",
)
qformer = Blip2QFormerModel(qformer_config)
unet = UNet2DConditionModel(
- block_out_channels=(16, 32),
- norm_num_groups=16,
+ block_out_channels=(8, 16),
+ norm_num_groups=8,
layers_per_block=1,
sample_size=16,
in_channels=4,
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
- cross_attention_dim=16,
+ cross_attention_dim=8,
)
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
@@ -189,7 +189,9 @@ def test_blipdiffusion(self):
assert image.shape == (1, 16, 16, 4)
- expected_slice = np.array([0.7096, 0.5900, 0.6703, 0.4032, 0.7766, 0.3629, 0.5447, 0.4149, 0.8172])
+ expected_slice = np.array(
+ [0.5329548, 0.8372512, 0.33269387, 0.82096875, 0.43657133, 0.3783, 0.5953028, 0.51934963, 0.42142007]
+ )
assert (
np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
diff --git a/tests/pipelines/ddim/test_ddim.py b/tests/pipelines/ddim/test_ddim.py
index 0f0654397a34..2078a592ceca 100644
--- a/tests/pipelines/ddim/test_ddim.py
+++ b/tests/pipelines/ddim/test_ddim.py
@@ -42,9 +42,10 @@ class DDIMPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
def get_dummy_components(self):
torch.manual_seed(0)
unet = UNet2DModel(
- block_out_channels=(32, 64),
- layers_per_block=2,
- sample_size=32,
+ block_out_channels=(4, 8),
+ layers_per_block=1,
+ norm_num_groups=4,
+ sample_size=8,
in_channels=3,
out_channels=3,
down_block_types=("DownBlock2D", "AttnDownBlock2D"),
@@ -79,10 +80,8 @@ def test_inference(self):
image = pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1]
- self.assertEqual(image.shape, (1, 32, 32, 3))
- expected_slice = np.array(
- [1.000e00, 5.717e-01, 4.717e-01, 1.000e00, 0.000e00, 1.000e00, 3.000e-04, 0.000e00, 9.000e-04]
- )
+ self.assertEqual(image.shape, (1, 8, 8, 3))
+ expected_slice = np.array([0.0, 9.979e-01, 0.0, 9.999e-01, 9.986e-01, 9.991e-01, 7.106e-04, 0.0, 0.0])
max_diff = np.abs(image_slice.flatten() - expected_slice).max()
self.assertLessEqual(max_diff, 1e-3)
diff --git a/tests/pipelines/ddpm/test_ddpm.py b/tests/pipelines/ddpm/test_ddpm.py
index c0cce3a2f237..f6d0821da4c2 100644
--- a/tests/pipelines/ddpm/test_ddpm.py
+++ b/tests/pipelines/ddpm/test_ddpm.py
@@ -30,9 +30,10 @@ class DDPMPipelineFastTests(unittest.TestCase):
def dummy_uncond_unet(self):
torch.manual_seed(0)
model = UNet2DModel(
- block_out_channels=(32, 64),
- layers_per_block=2,
- sample_size=32,
+ block_out_channels=(4, 8),
+ layers_per_block=1,
+ norm_num_groups=4,
+ sample_size=8,
in_channels=3,
out_channels=3,
down_block_types=("DownBlock2D", "AttnDownBlock2D"),
@@ -58,10 +59,8 @@ def test_fast_inference(self):
image_slice = image[0, -3:, -3:, -1]
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
- assert image.shape == (1, 32, 32, 3)
- expected_slice = np.array(
- [9.956e-01, 5.785e-01, 4.675e-01, 9.930e-01, 0.0, 1.000, 1.199e-03, 2.648e-04, 5.101e-04]
- )
+ assert image.shape == (1, 8, 8, 3)
+ expected_slice = np.array([0.0, 0.9996672, 0.00329116, 1.0, 0.9995991, 1.0, 0.0060907, 0.00115037, 0.0])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
@@ -83,7 +82,7 @@ def test_inference_predict_sample(self):
image_slice = image[0, -3:, -3:, -1]
image_eps_slice = image_eps[0, -3:, -3:, -1]
- assert image.shape == (1, 32, 32, 3)
+ assert image.shape == (1, 8, 8, 3)
tolerance = 1e-2 if torch_device != "mps" else 3e-2
assert np.abs(image_slice.flatten() - image_eps_slice.flatten()).max() < tolerance
diff --git a/tests/pipelines/pixart_alpha/test_pixart.py b/tests/pipelines/pixart_alpha/test_pixart.py
index d981b55260c7..dd358af08395 100644
--- a/tests/pipelines/pixart_alpha/test_pixart.py
+++ b/tests/pipelines/pixart_alpha/test_pixart.py
@@ -324,10 +324,6 @@ def test_raises_warning_for_mask_feature(self):
def test_inference_batch_single_identical(self):
self._test_inference_batch_single_identical(expected_max_diff=1e-3)
- # PixArt transformer model does not work with sequential offload so skip it for now
- def test_sequential_offload_forward_pass_twice(self):
- pass
-
@slow
@require_torch_gpu
diff --git a/tests/pipelines/pixart_sigma/test_pixart.py b/tests/pipelines/pixart_sigma/test_pixart.py
index 7b1d5e389f32..c0df15ae661d 100644
--- a/tests/pipelines/pixart_sigma/test_pixart.py
+++ b/tests/pipelines/pixart_sigma/test_pixart.py
@@ -308,10 +308,6 @@ def test_inference_with_multiple_images_per_prompt(self):
def test_inference_batch_single_identical(self):
self._test_inference_batch_single_identical(expected_max_diff=1e-3)
- # PixArt transformer model does not work with sequential offload so skip it for now
- def test_sequential_offload_forward_pass_twice(self):
- pass
-
@slow
@require_torch_gpu
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index 145e0012f8e9..137f1e696d93 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -1257,8 +1257,8 @@ def tearDown(self):
def test_download_from_hub(self):
ckpt_paths = [
- "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt",
- "https://huggingface.co/WarriorMama777/OrangeMixs/blob/main/Models/AbyssOrangeMix/AbyssOrangeMix_base.ckpt",
+ "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors",
+ "https://huggingface.co/WarriorMama777/OrangeMixs/blob/main/Models/AbyssOrangeMix/AbyssOrangeMix.safetensors",
]
for ckpt_path in ckpt_paths:
@@ -1271,7 +1271,7 @@ def test_download_from_hub(self):
assert image_out.shape == (512, 512, 3)
def test_download_local(self):
- ckpt_filename = hf_hub_download("runwayml/stable-diffusion-v1-5", filename="v1-5-pruned-emaonly.ckpt")
+ ckpt_filename = hf_hub_download("runwayml/stable-diffusion-v1-5", filename="v1-5-pruned-emaonly.safetensors")
config_filename = hf_hub_download("runwayml/stable-diffusion-v1-5", filename="v1-inference.yaml")
pipe = StableDiffusionPipeline.from_single_file(
@@ -1285,7 +1285,7 @@ def test_download_local(self):
assert image_out.shape == (512, 512, 3)
def test_download_ckpt_diff_format_is_same(self):
- ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt"
+ ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors"
sf_pipe = StableDiffusionPipeline.from_single_file(ckpt_path)
sf_pipe.scheduler = DDIMScheduler.from_config(sf_pipe.scheduler.config)
@@ -1310,7 +1310,7 @@ def test_download_ckpt_diff_format_is_same(self):
def test_single_file_component_configs(self):
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
- ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt"
+ ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors"
single_file_pipe = StableDiffusionPipeline.from_single_file(ckpt_path, load_safety_checker=True)
for param_name, param_value in single_file_pipe.text_encoder.config.to_dict().items():
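The stable-diffusion hunks above move the single-file fixtures from `.ckpt` to `.safetensors`. A minimal sketch of the loading path those tests exercise, reusing one of the URLs from the patch; the prompt and device placement are assumptions:

```python
import torch
from diffusers import StableDiffusionPipeline

# safetensors single-file checkpoint, as referenced in the updated fixtures.
ckpt_path = (
    "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/"
    "v1-5-pruned-emaonly.safetensors"
)

pipe = StableDiffusionPipeline.from_single_file(ckpt_path, torch_dtype=torch.float16)
pipe = pipe.to("cuda")  # assumes a CUDA device is available

image = pipe("an astronaut riding a horse", num_inference_steps=25).images[0]
```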
diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py
index 0c0a765f662d..c4b7a3b9187a 100644
--- a/tests/pipelines/test_pipelines_common.py
+++ b/tests/pipelines/test_pipelines_common.py
@@ -1360,6 +1360,8 @@ def _test_attention_slicing_forward_pass(
reason="CPU offload is only available with CUDA and `accelerate v0.14.0` or higher",
)
def test_sequential_cpu_offload_forward_pass(self, expected_max_diff=1e-4):
+ import accelerate
+
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
for component in pipe.components.values():
@@ -1373,6 +1375,7 @@ def test_sequential_cpu_offload_forward_pass(self, expected_max_diff=1e-4):
output_without_offload = pipe(**inputs)[0]
pipe.enable_sequential_cpu_offload()
+ assert pipe._execution_device.type == "cuda"
inputs = self.get_dummy_inputs(generator_device)
output_with_offload = pipe(**inputs)[0]
@@ -1380,11 +1383,48 @@ def test_sequential_cpu_offload_forward_pass(self, expected_max_diff=1e-4):
max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max()
self.assertLess(max_diff, expected_max_diff, "CPU offloading should not affect the inference results")
+ # make sure all `torch.nn.Module` components (except those in `self._exclude_from_cpu_offload`) are offloaded correctly
+ offloaded_modules = {
+ k: v
+ for k, v in pipe.components.items()
+ if isinstance(v, torch.nn.Module) and k not in pipe._exclude_from_cpu_offload
+ }
+ # 1. all offloaded modules should be saved to cpu and moved to meta device
+ self.assertTrue(
+ all(v.device.type == "meta" for v in offloaded_modules.values()),
+ f"Not offloaded: {[k for k, v in offloaded_modules.items() if v.device.type != 'meta']}",
+ )
+ # 2. all offloaded modules should have a hook installed
+ self.assertTrue(
+ all(hasattr(v, "_hf_hook") for k, v in offloaded_modules.items()),
+ f"No hook attached: {[k for k, v in offloaded_modules.items() if not hasattr(v, '_hf_hook')]}",
+ )
+ # 3. all offloaded modules should have the correct hook installed, which should be one of:
+ # - `AlignDevicesHook`
+ # - a `SequentialHook` that contains `AlignDevicesHook`
+ offloaded_modules_with_incorrect_hooks = {}
+ for k, v in offloaded_modules.items():
+ if hasattr(v, "_hf_hook"):
+ if isinstance(v._hf_hook, accelerate.hooks.SequentialHook):
+ # if it is a `SequentialHook`, we loop through its `hooks` attribute to check if it only contains `AlignDevicesHook`
+ for hook in v._hf_hook.hooks:
+ if not isinstance(hook, accelerate.hooks.AlignDevicesHook):
+ offloaded_modules_with_incorrect_hooks[k] = type(hook)
+ elif not isinstance(v._hf_hook, accelerate.hooks.AlignDevicesHook):
+ offloaded_modules_with_incorrect_hooks[k] = type(v._hf_hook)
+
+ self.assertTrue(
+ len(offloaded_modules_with_incorrect_hooks) == 0,
+ f"Not installed correct hook: {offloaded_modules_with_incorrect_hooks}",
+ )
+
@unittest.skipIf(
torch_device != "cuda" or not is_accelerate_available() or is_accelerate_version("<", "0.17.0"),
reason="CPU offload is only available with CUDA and `accelerate v0.17.0` or higher",
)
def test_model_cpu_offload_forward_pass(self, expected_max_diff=2e-4):
+ import accelerate
+
generator_device = "cpu"
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
@@ -1400,19 +1440,39 @@ def test_model_cpu_offload_forward_pass(self, expected_max_diff=2e-4):
output_without_offload = pipe(**inputs)[0]
pipe.enable_model_cpu_offload()
+ assert pipe._execution_device.type == "cuda"
+
inputs = self.get_dummy_inputs(generator_device)
output_with_offload = pipe(**inputs)[0]
max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max()
self.assertLess(max_diff, expected_max_diff, "CPU offloading should not affect the inference results")
- offloaded_modules = [
- v
+
+ # make sure all `torch.nn.Module` components (except those in `self._exclude_from_cpu_offload`) are offloaded correctly
+ offloaded_modules = {
+ k: v
for k, v in pipe.components.items()
if isinstance(v, torch.nn.Module) and k not in pipe._exclude_from_cpu_offload
- ]
- (
- self.assertTrue(all(v.device.type == "cpu" for v in offloaded_modules)),
- f"Not offloaded: {[v for v in offloaded_modules if v.device.type != 'cpu']}",
+ }
+ # 1. check if all offloaded modules are saved to cpu
+ self.assertTrue(
+ all(v.device.type == "cpu" for v in offloaded_modules.values()),
+ f"Not offloaded: {[k for k, v in offloaded_modules.items() if v.device.type != 'cpu']}",
+ )
+ # 2. check if all offloaded modules have hooks installed
+ self.assertTrue(
+ all(hasattr(v, "_hf_hook") for k, v in offloaded_modules.items()),
+ f"No hook attached: {[k for k, v in offloaded_modules.items() if not hasattr(v, '_hf_hook')]}",
+ )
+ # 3. check that all offloaded modules have the correct type of hook installed, which should be `CpuOffload`
+ offloaded_modules_with_incorrect_hooks = {}
+ for k, v in offloaded_modules.items():
+ if hasattr(v, "_hf_hook") and not isinstance(v._hf_hook, accelerate.hooks.CpuOffload):
+ offloaded_modules_with_incorrect_hooks[k] = type(v._hf_hook)
+
+ self.assertTrue(
+ len(offloaded_modules_with_incorrect_hooks) == 0,
+ f"Not installed correct hook: {offloaded_modules_with_incorrect_hooks}",
)
@unittest.skipIf(
@@ -1444,16 +1504,24 @@ def test_cpu_offload_forward_pass_twice(self, expected_max_diff=2e-4):
self.assertLess(
max_diff, expected_max_diff, "running CPU offloading 2nd time should not affect the inference results"
)
+
+ # make sure all `torch.nn.Module` components (except those in `self._exclude_from_cpu_offload`) are offloaded correctly
offloaded_modules = {
k: v
for k, v in pipe.components.items()
if isinstance(v, torch.nn.Module) and k not in pipe._exclude_from_cpu_offload
}
+ # 1. check if all offloaded modules are saved to cpu
self.assertTrue(
all(v.device.type == "cpu" for v in offloaded_modules.values()),
f"Not offloaded: {[k for k, v in offloaded_modules.items() if v.device.type != 'cpu']}",
)
-
+ # 2. check if all offloaded modules have hooks installed
+ self.assertTrue(
+ all(hasattr(v, "_hf_hook") for k, v in offloaded_modules.items()),
+ f"No hook attached: {[k for k, v in offloaded_modules.items() if not hasattr(v, '_hf_hook')]}",
+ )
+ # 3. check that all offloaded modules have the correct type of hook installed, which should be `CpuOffload`
offloaded_modules_with_incorrect_hooks = {}
for k, v in offloaded_modules.items():
if hasattr(v, "_hf_hook") and not isinstance(v._hf_hook, accelerate.hooks.CpuOffload):
@@ -1493,19 +1561,36 @@ def test_sequential_offload_forward_pass_twice(self, expected_max_diff=2e-4):
self.assertLess(
max_diff, expected_max_diff, "running sequential offloading second time should have the inference results"
)
+
+ # make sure all `torch.nn.Module` components (except those in `self._exclude_from_cpu_offload`) are offloaded correctly
offloaded_modules = {
k: v
for k, v in pipe.components.items()
if isinstance(v, torch.nn.Module) and k not in pipe._exclude_from_cpu_offload
}
+ # 1. check if all offloaded modules are moved to meta device
self.assertTrue(
all(v.device.type == "meta" for v in offloaded_modules.values()),
f"Not offloaded: {[k for k, v in offloaded_modules.items() if v.device.type != 'meta']}",
)
+ # 2. check that all offloaded modules have a hook installed
+ self.assertTrue(
+ all(hasattr(v, "_hf_hook") for k, v in offloaded_modules.items()),
+ f"No hook attached: {[k for k, v in offloaded_modules.items() if not hasattr(v, '_hf_hook')]}",
+ )
+ # 3. check that all offloaded modules have the correct hook installed, which should be one of:
+ # - `AlignDevicesHook`
+ # - a `SequentialHook` that contains `AlignDevicesHook`
offloaded_modules_with_incorrect_hooks = {}
for k, v in offloaded_modules.items():
- if hasattr(v, "_hf_hook") and not isinstance(v._hf_hook, accelerate.hooks.AlignDevicesHook):
- offloaded_modules_with_incorrect_hooks[k] = type(v._hf_hook)
+ if hasattr(v, "_hf_hook"):
+ if isinstance(v._hf_hook, accelerate.hooks.SequentialHook):
+ # if it is a `SequentialHook`, we loop through its `hooks` attribute to check if it only contains `AlignDevicesHook`
+ for hook in v._hf_hook.hooks:
+ if not isinstance(hook, accelerate.hooks.AlignDevicesHook):
+ offloaded_modules_with_incorrect_hooks[k] = type(hook)
+ elif not isinstance(v._hf_hook, accelerate.hooks.AlignDevicesHook):
+ offloaded_modules_with_incorrect_hooks[k] = type(v._hf_hook)
self.assertTrue(
len(offloaded_modules_with_incorrect_hooks) == 0,
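For sequential offload, the hunks above accept either a plain `AlignDevicesHook` or a `SequentialHook` whose sub-hooks are all `AlignDevicesHook`s. A hedged, standalone version of that inspection (the helper name is an assumption), usable after `pipe.enable_sequential_cpu_offload()`:

```python
import accelerate
import torch


def find_incorrect_offload_hooks(pipe):
    """Map component name -> offending hook type for modules that are not
    guarded by an AlignDevicesHook (directly or inside a SequentialHook)."""
    incorrect = {}
    for name, module in pipe.components.items():
        if not isinstance(module, torch.nn.Module) or name in pipe._exclude_from_cpu_offload:
            continue
        hook = getattr(module, "_hf_hook", None)
        if hook is None:
            incorrect[name] = None  # no hook attached at all
        elif isinstance(hook, accelerate.hooks.SequentialHook):
            # A SequentialHook is acceptable only if every sub-hook aligns devices.
            for sub_hook in hook.hooks:
                if not isinstance(sub_hook, accelerate.hooks.AlignDevicesHook):
                    incorrect[name] = type(sub_hook)
        elif not isinstance(hook, accelerate.hooks.AlignDevicesHook):
            incorrect[name] = type(hook)
    return incorrect


# Example usage after enabling sequential offload on any diffusers pipeline:
#     pipe.enable_sequential_cpu_offload()
#     assert not find_incorrect_offload_hooks(pipe)
```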