diff --git a/.github/workflows/nightly_tests.yml b/.github/workflows/nightly_tests.yml
index 2f73c66de829..2e9ac33d6b00 100644
--- a/.github/workflows/nightly_tests.yml
+++ b/.github/workflows/nightly_tests.yml
@@ -19,7 +19,7 @@ env:
jobs:
setup_torch_cuda_pipeline_matrix:
name: Setup Torch Pipelines Matrix
- runs-on: ubuntu-latest
+ runs-on: diffusers/diffusers-pytorch-cpu
outputs:
pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }}
steps:
@@ -67,19 +67,19 @@ jobs:
fetch-depth: 2
- name: NVIDIA-SMI
run: nvidia-smi
-
+
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
python -m uv pip install pytest-reportlog
-
+
- name: Environment
run: |
python utils/print_env.py
-
- - name: Nightly PyTorch CUDA checkpoint (pipelines) tests
+
+ - name: Nightly PyTorch CUDA checkpoint (pipelines) tests
env:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
@@ -88,9 +88,9 @@ jobs:
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_pipeline_${{ matrix.module }}_cuda \
- --report-log=tests_pipeline_${{ matrix.module }}_cuda.log \
+ --report-log=tests_pipeline_${{ matrix.module }}_cuda.log \
tests/pipelines/${{ matrix.module }}
-
+
- name: Failure short reports
if: ${{ failure() }}
run: |
@@ -103,7 +103,7 @@ jobs:
with:
name: pipeline_${{ matrix.module }}_test_reports
path: reports
-
+
- name: Generate Report and Notify Channel
if: always()
run: |
@@ -112,7 +112,7 @@ jobs:
run_nightly_tests_for_other_torch_modules:
name: Torch Non-Pipelines CUDA Nightly Tests
- runs-on: docker-gpu
+ runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: diffusers/diffusers-pytorch-cuda
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
@@ -139,7 +139,7 @@ jobs:
run: python utils/print_env.py
- name: Run nightly PyTorch CUDA tests for non-pipeline modules
- if: ${{ matrix.module != 'examples'}}
+ if: ${{ matrix.module != 'examples'}}
env:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
@@ -148,7 +148,7 @@ jobs:
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_torch_${{ matrix.module }}_cuda \
- --report-log=tests_torch_${{ matrix.module }}_cuda.log \
+ --report-log=tests_torch_${{ matrix.module }}_cuda.log \
tests/${{ matrix.module }}
- name: Run nightly example tests with Torch
@@ -161,13 +161,13 @@ jobs:
python -m uv pip install peft@git+https://github.com/huggingface/peft.git
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v --make-reports=examples_torch_cuda \
- --report-log=examples_torch_cuda.log \
+ --report-log=examples_torch_cuda.log \
examples/
- name: Failure short reports
if: ${{ failure() }}
run: |
- cat reports/tests_torch_${{ matrix.module }}_cuda_stats.txt
+ cat reports/tests_torch_${{ matrix.module }}_cuda_stats.txt
cat reports/tests_torch_${{ matrix.module }}_cuda_failures_short.txt
- name: Test suite reports artifacts
@@ -185,7 +185,7 @@ jobs:
run_lora_nightly_tests:
name: Nightly LoRA Tests with PEFT and TORCH
- runs-on: docker-gpu
+ runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: diffusers/diffusers-pytorch-cuda
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
@@ -218,13 +218,13 @@ jobs:
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_torch_lora_cuda \
- --report-log=tests_torch_lora_cuda.log \
+ --report-log=tests_torch_lora_cuda.log \
tests/lora
-
+
- name: Failure short reports
if: ${{ failure() }}
run: |
- cat reports/tests_torch_lora_cuda_stats.txt
+ cat reports/tests_torch_lora_cuda_stats.txt
cat reports/tests_torch_lora_cuda_failures_short.txt
- name: Test suite reports artifacts
@@ -239,12 +239,12 @@ jobs:
run: |
pip install slack_sdk tabulate
python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
-
+
run_flax_tpu_tests:
name: Nightly Flax TPU Tests
runs-on: docker-tpu
if: github.event_name == 'schedule'
-
+
container:
image: diffusers/diffusers-flax-tpu
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --privileged
@@ -274,7 +274,7 @@ jobs:
python -m pytest -n 0 \
-s -v -k "Flax" \
--make-reports=tests_flax_tpu \
- --report-log=tests_flax_tpu.log \
+ --report-log=tests_flax_tpu.log \
tests/
- name: Failure short reports
@@ -298,11 +298,11 @@ jobs:
run_nightly_onnx_tests:
name: Nightly ONNXRuntime CUDA tests on Ubuntu
- runs-on: docker-gpu
+ runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: diffusers/diffusers-onnxruntime-cuda
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
-
+
steps:
- name: Checkout diffusers
uses: actions/checkout@v3
@@ -321,7 +321,7 @@ jobs:
- name: Environment
run: python utils/print_env.py
-
+
- name: Run nightly ONNXRuntime CUDA tests
env:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
@@ -329,7 +329,7 @@ jobs:
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "Onnx" \
--make-reports=tests_onnx_cuda \
- --report-log=tests_onnx_cuda.log \
+ --report-log=tests_onnx_cuda.log \
tests/
- name: Failure short reports
@@ -344,7 +344,7 @@ jobs:
with:
name: ${{ matrix.config.report }}_test_reports
path: reports
-
+
- name: Generate Report and Notify Channel
if: always()
run: |
diff --git a/.github/workflows/pr_test_fetcher.yml b/.github/workflows/pr_test_fetcher.yml
index 4dbb118c6092..7bbaaf2240a2 100644
--- a/.github/workflows/pr_test_fetcher.yml
+++ b/.github/workflows/pr_test_fetcher.yml
@@ -15,7 +15,7 @@ concurrency:
jobs:
setup_pr_tests:
name: Setup PR Tests
- runs-on: docker-cpu
+ runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
container:
image: diffusers/diffusers-pytorch-cpu
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
@@ -73,7 +73,7 @@ jobs:
max-parallel: 2
matrix:
modules: ${{ fromJson(needs.setup_pr_tests.outputs.matrix) }}
- runs-on: docker-cpu
+ runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
container:
image: diffusers/diffusers-pytorch-cpu
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
@@ -123,7 +123,7 @@ jobs:
config:
- name: Hub tests for models, schedulers, and pipelines
framework: hub_tests_pytorch
- runner: docker-cpu
+ runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
image: diffusers/diffusers-pytorch-cpu
report: torch_hub
diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml
index a6cb123a7035..d071af2b0be2 100644
--- a/.github/workflows/push_tests.yml
+++ b/.github/workflows/push_tests.yml
@@ -21,7 +21,9 @@ env:
jobs:
setup_torch_cuda_pipeline_matrix:
name: Setup Torch Pipelines CUDA Slow Tests Matrix
- runs-on: ubuntu-latest
+ runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
+ container:
+ image: diffusers/diffusers-pytorch-cpu
outputs:
pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }}
steps:
@@ -29,14 +31,13 @@ jobs:
uses: actions/checkout@v3
with:
fetch-depth: 2
- - name: Set up Python
- uses: actions/setup-python@v4
- with:
- python-version: "3.8"
- name: Install dependencies
run: |
- pip install -e .
- pip install huggingface_hub
+ python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+ python -m uv pip install -e [quality,test]
+ - name: Environment
+ run: |
+ python utils/print_env.py
- name: Fetch Pipeline Matrix
id: fetch_pipeline_matrix
run: |
@@ -60,7 +61,7 @@ jobs:
runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: diffusers/diffusers-pytorch-cuda
- options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0 --privileged
+ options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0 --privileged
steps:
- name: Checkout diffusers
uses: actions/checkout@v3
@@ -114,10 +115,10 @@ jobs:
torch_cuda_tests:
name: Torch CUDA Tests
- runs-on: docker-gpu
+ runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: diffusers/diffusers-pytorch-cuda
- options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
+ options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0
defaults:
run:
shell: bash
@@ -166,10 +167,10 @@ jobs:
peft_cuda_tests:
name: PEFT CUDA Tests
- runs-on: docker-gpu
+ runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: diffusers/diffusers-pytorch-cuda
- options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
+ options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0
defaults:
run:
shell: bash
@@ -219,7 +220,7 @@ jobs:
runs-on: docker-tpu
container:
image: diffusers/diffusers-flax-tpu
- options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --privileged
+ options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --privileged
defaults:
run:
shell: bash
@@ -263,10 +264,10 @@ jobs:
onnx_cuda_tests:
name: ONNX CUDA Tests
- runs-on: docker-gpu
+ runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: diffusers/diffusers-onnxruntime-cuda
- options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
+ options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --gpus 0
defaults:
run:
shell: bash
@@ -311,11 +312,11 @@ jobs:
run_torch_compile_tests:
name: PyTorch Compile CUDA tests
- runs-on: docker-gpu
+ runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: diffusers/diffusers-pytorch-compile-cuda
- options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
+ options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Checkout diffusers
@@ -352,11 +353,11 @@ jobs:
run_xformers_tests:
name: PyTorch xformers CUDA tests
- runs-on: docker-gpu
+ runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: diffusers/diffusers-pytorch-xformers-cuda
- options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
+ options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Checkout diffusers
@@ -393,11 +394,11 @@ jobs:
run_examples_tests:
name: Examples PyTorch CUDA tests on Ubuntu
- runs-on: docker-gpu
+ runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: diffusers/diffusers-pytorch-cuda
- options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
+ options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Checkout diffusers
diff --git a/.github/workflows/ssh-runner.yml b/.github/workflows/ssh-runner.yml
new file mode 100644
index 000000000000..e5bbdd64f549
--- /dev/null
+++ b/.github/workflows/ssh-runner.yml
@@ -0,0 +1,46 @@
+name: SSH into runners
+
+on:
+ workflow_dispatch:
+ inputs:
+ runner_type:
+ description: 'Type of runner to test (a10 or t4)'
+ required: true
+ docker_image:
+ description: 'Name of the Docker image'
+ required: true
+
+env:
+ IS_GITHUB_CI: "1"
+ HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
+ HF_HOME: /mnt/cache
+ DIFFUSERS_IS_CI: yes
+ OMP_NUM_THREADS: 8
+ MKL_NUM_THREADS: 8
+ RUN_SLOW: yes
+
+jobs:
+ ssh_runner:
+ name: "SSH"
+ runs-on: [single-gpu, nvidia-gpu, "${{ github.event.inputs.runner_type }}", ci]
+ container:
+ image: ${{ github.event.inputs.docker_image }}
+ options: --gpus all --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+
+ steps:
+ - name: Checkout diffusers
+ uses: actions/checkout@v3
+ with:
+ fetch-depth: 2
+
+ - name: NVIDIA-SMI
+ run: |
+ nvidia-smi
+
+ - name: Tailscale # In order to be able to SSH when a test fails
+ uses: huggingface/tailscale-action@v1
+ with:
+ authkey: ${{ secrets.TAILSCALE_SSH_AUTHKEY }}
+ slackChannel: ${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}
+ slackToken: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
+ waitForSSH: true
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 140bdfad3a89..1c21d4cd9f74 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -62,13 +62,11 @@
- local: using-diffusers/callback
title: Pipeline callbacks
- local: using-diffusers/reusing_seeds
- title: Improve image quality with deterministic generation
- - local: using-diffusers/control_brightness
- title: Control image brightness
+ title: Reproducible pipelines
+ - local: using-diffusers/image_quality
+ title: Controlling image quality
- local: using-diffusers/weighted_prompts
title: Prompt techniques
- - local: using-diffusers/freeu
- title: Improve generation quality with FreeU
title: Inference techniques
- sections:
- local: using-diffusers/sdxl
@@ -83,22 +81,14 @@
title: ControlNet
- local: using-diffusers/t2i_adapter
title: T2I-Adapter
+ - local: using-diffusers/inference_with_lcm
+ title: Latent Consistency Model
- local: using-diffusers/textual_inversion_inference
title: Textual inversion
- local: using-diffusers/shap-e
title: Shap-E
- local: using-diffusers/diffedit
title: DiffEdit
- - local: using-diffusers/reproducibility
- title: Create reproducible pipelines
- - local: using-diffusers/custom_pipeline_examples
- title: Community pipelines
- - local: using-diffusers/contribute_pipeline
- title: Contribute a community pipeline
- - local: using-diffusers/inference_with_lcm_lora
- title: Latent Consistency Model-LoRA
- - local: using-diffusers/inference_with_lcm
- title: Latent Consistency Model
- local: using-diffusers/inference_with_tcd_lora
title: Trajectory Consistency Distillation-LoRA
- local: using-diffusers/svd
@@ -149,8 +139,6 @@
- sections:
- local: optimization/fp16
title: Speed up inference
- - local: using-diffusers/distilled_sd
- title: Distilled Stable Diffusion inference
- local: optimization/memory
title: Reduce memory usage
- local: optimization/torch2.0
diff --git a/docs/source/en/api/attnprocessor.md b/docs/source/en/api/attnprocessor.md
index ab89d4d260f0..f586e9b08f2c 100644
--- a/docs/source/en/api/attnprocessor.md
+++ b/docs/source/en/api/attnprocessor.md
@@ -55,3 +55,6 @@ An attention processor is a class for applying different types of attention mech
## XFormersAttnProcessor
[[autodoc]] models.attention_processor.XFormersAttnProcessor
+
+## AttnProcessorNPU
+[[autodoc]] models.attention_processor.AttnProcessorNPU
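+
+A minimal sketch of routing a model's attention layers through this processor; it assumes an Ascend NPU environment with torch_npu installed, otherwise constructing the processor fails:
+
+```py
+from diffusers import UNet2DConditionModel
+from diffusers.models.attention_processor import AttnProcessorNPU
+
+unet = UNet2DConditionModel.from_pretrained(
+    "runwayml/stable-diffusion-v1-5", subfolder="unet"
+)
+# use NPU flash attention in every attention layer of the UNet
+unet.set_attn_processor(AttnProcessorNPU())
+```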
diff --git a/docs/source/en/api/pipelines/animatediff.md b/docs/source/en/api/pipelines/animatediff.md
index 913529e6ebdc..425764541590 100644
--- a/docs/source/en/api/pipelines/animatediff.md
+++ b/docs/source/en/api/pipelines/animatediff.md
@@ -101,6 +101,53 @@ AnimateDiff tends to work better with finetuned Stable Diffusion models. If you
+### AnimateDiffSDXLPipeline
+
+AnimateDiff can also be used with SDXL models. This is currently an experimental feature as only a beta release of the motion adapter checkpoint is available.
+
+```python
+import torch
+from diffusers.models import MotionAdapter
+from diffusers import AnimateDiffSDXLPipeline, DDIMScheduler
+from diffusers.utils import export_to_gif
+
+adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-sdxl-beta", torch_dtype=torch.float16)
+
+model_id = "stabilityai/stable-diffusion-xl-base-1.0"
+scheduler = DDIMScheduler.from_pretrained(
+ model_id,
+ subfolder="scheduler",
+ clip_sample=False,
+ timestep_spacing="linspace",
+ beta_schedule="linear",
+ steps_offset=1,
+)
+pipe = AnimateDiffSDXLPipeline.from_pretrained(
+ model_id,
+ motion_adapter=adapter,
+ scheduler=scheduler,
+ torch_dtype=torch.float16,
+ variant="fp16",
+).to("cuda")
+
+# enable memory savings
+pipe.enable_vae_slicing()
+pipe.enable_vae_tiling()
+
+output = pipe(
+ prompt="a panda surfing in the ocean, realistic, high quality",
+ negative_prompt="low quality, worst quality",
+ num_inference_steps=20,
+ guidance_scale=8,
+ width=1024,
+ height=1024,
+ num_frames=16,
+)
+
+frames = output.frames[0]
+export_to_gif(frames, "animation.gif")
+```
+
### AnimateDiffVideoToVideoPipeline
AnimateDiff can also be used to generate visually similar videos or enable style/character/background or other edits starting from an initial video, allowing you to seamlessly explore creative possibilities.
@@ -522,6 +569,12 @@ export_to_gif(frames, "animatelcm-motion-lora.gif")
- all
- __call__
+## AnimateDiffSDXLPipeline
+
+[[autodoc]] AnimateDiffSDXLPipeline
+ - all
+ - __call__
+
## AnimateDiffVideoToVideoPipeline
[[autodoc]] AnimateDiffVideoToVideoPipeline
diff --git a/docs/source/en/api/pipelines/overview.md b/docs/source/en/api/pipelines/overview.md
index cd1232a90d6e..e7b8bf4936c0 100644
--- a/docs/source/en/api/pipelines/overview.md
+++ b/docs/source/en/api/pipelines/overview.md
@@ -97,6 +97,11 @@ The table below lists all the pipelines currently available in 🤗 Diffusers an
- to
- components
+
+[[autodoc]] pipelines.StableDiffusionMixin.enable_freeu
+
+[[autodoc]] pipelines.StableDiffusionMixin.disable_freeu
+
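+A minimal sketch of toggling FreeU on a Stable Diffusion pipeline; the scaling values below are the ones commonly suggested for Stable Diffusion v1.5 and are only an illustration:
+
+```py
+import torch
+from diffusers import DiffusionPipeline
+
+pipeline = DiffusionPipeline.from_pretrained(
+    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True
+).to("cuda")
+
+# reweight the UNet's backbone and skip connection features
+pipeline.enable_freeu(s1=0.9, s2=0.2, b1=1.2, b2=1.4)
+image = pipeline("an astronaut riding a horse on mars").images[0]
+
+# restore the default UNet behavior
+pipeline.disable_freeu()
+```
+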
## FlaxDiffusionPipeline
[[autodoc]] pipelines.pipeline_flax_utils.FlaxDiffusionPipeline
diff --git a/docs/source/en/api/utilities.md b/docs/source/en/api/utilities.md
index 71253db215ab..d4f4d7d7964f 100644
--- a/docs/source/en/api/utilities.md
+++ b/docs/source/en/api/utilities.md
@@ -37,3 +37,7 @@ Utility and helper functions for working with 🤗 Diffusers.
## make_image_grid
[[autodoc]] utils.make_image_grid
+
+## randn_tensor
+
+[[autodoc]] utils.torch_utils.randn_tensor
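+
+A short sketch of creating reproducible latent noise with `randn_tensor`; the shape below is only an example for a 512x512 Stable Diffusion latent:
+
+```py
+import torch
+from diffusers.utils.torch_utils import randn_tensor
+
+# a seeded generator makes the noise reproducible across runs
+generator = torch.Generator("cpu").manual_seed(0)
+latents = randn_tensor(
+    (1, 4, 64, 64), generator=generator, device=torch.device("cuda"), dtype=torch.float16
+)
+```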
diff --git a/docs/source/en/conceptual/contribution.md b/docs/source/en/conceptual/contribution.md
index 24ac52ba19c9..cc2e0ae07b2c 100644
--- a/docs/source/en/conceptual/contribution.md
+++ b/docs/source/en/conceptual/contribution.md
@@ -198,38 +198,81 @@ Anything displayed on [the official Diffusers doc page](https://huggingface.co/d
Please have a look at [this page](https://github.com/huggingface/diffusers/tree/main/docs) on how to verify changes made to the documentation locally.
-
### 6. Contribute a community pipeline
-[Pipelines](https://huggingface.co/docs/diffusers/api/pipelines/overview) are usually the first point of contact between the Diffusers library and the user.
-Pipelines are examples of how to use Diffusers [models](https://huggingface.co/docs/diffusers/api/models/overview) and [schedulers](https://huggingface.co/docs/diffusers/api/schedulers/overview).
-We support two types of pipelines:
+> [!TIP]
+> Read the [Community pipelines](../using-diffusers/custom_pipeline_overview#community-pipelines) guide to learn more about the difference between a GitHub and Hugging Face Hub community pipeline. If you're interested in why we have community pipelines, take a look at GitHub Issue [#841](https://github.com/huggingface/diffusers/issues/841) (basically, we can't maintain all the possible ways diffusion models can be used for inference but we also don't want to prevent the community from building them).
+
+Contributing a community pipeline is a great way to share your creativity and work with the community. It lets you build on top of the [`DiffusionPipeline`] so that anyone can load and use it by setting the `custom_pipeline` parameter. This section will walk you through how to create a simple pipeline where the UNet only does a single forward pass and calls the scheduler once (a "one-step" pipeline).
+
+1. Create a one_step_unet.py file for your community pipeline. This file can use whatever packages you want as long as they're installed by the user. Make sure you have only one pipeline class that inherits from [`DiffusionPipeline`] to load model weights and the scheduler configuration from the Hub. Add a UNet and scheduler to the `__init__` function.
+
+ You should also add the `register_modules` function to ensure your pipeline and its components can be saved with [`~DiffusionPipeline.save_pretrained`].
+
+```py
+from diffusers import DiffusionPipeline
+import torch
+
+class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
+ def __init__(self, unet, scheduler):
+ super().__init__()
+
+ self.register_modules(unet=unet, scheduler=scheduler)
+```
+
+2. In the forward pass (which we recommend defining as `__call__`), you can add any feature you'd like. For the "one-step" pipeline, create a random image and call the UNet and scheduler once by setting `timestep=1`.
+
+```py
+ from diffusers import DiffusionPipeline
+ import torch
+
+ class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
+ def __init__(self, unet, scheduler):
+ super().__init__()
+
+ self.register_modules(unet=unet, scheduler=scheduler)
-- Official Pipelines
-- Community Pipelines
+ def __call__(self):
+ image = torch.randn(
+ (1, self.unet.config.in_channels, self.unet.config.sample_size, self.unet.config.sample_size),
+ )
+ timestep = 1
+
+ model_output = self.unet(image, timestep).sample
+ scheduler_output = self.scheduler.step(model_output, timestep, image).prev_sample
+
+ return scheduler_output
+```
-Both official and community pipelines follow the same design and consist of the same type of components.
+Now you can run the pipeline by passing a UNet and scheduler to it, or you can load pretrained weights if the pipeline structure is identical.
+
+```py
+from diffusers import DDPMScheduler, UNet2DModel
+
+scheduler = DDPMScheduler()
+unet = UNet2DModel()
+
+pipeline = UnetSchedulerOneForwardPipeline(unet=unet, scheduler=scheduler)
+output = pipeline()
+# load pretrained weights
+pipeline = UnetSchedulerOneForwardPipeline.from_pretrained("google/ddpm-cifar10-32", use_safetensors=True)
+output = pipeline()
+```
-Official pipelines are tested and maintained by the core maintainers of Diffusers. Their code
-resides in [src/diffusers/pipelines](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines).
-In contrast, community pipelines are contributed and maintained purely by the **community** and are **not** tested.
-They reside in [examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) and while they can be accessed via the [PyPI diffusers package](https://pypi.org/project/diffusers/), their code is not part of the PyPI distribution.
+You can either share your pipeline as a GitHub community pipeline or Hub community pipeline.
-The reason for the distinction is that the core maintainers of the Diffusers library cannot maintain and test all
-possible ways diffusion models can be used for inference, but some of them may be of interest to the community.
-Officially released diffusion pipelines,
-such as Stable Diffusion are added to the core src/diffusers/pipelines package which ensures
-high quality of maintenance, no backward-breaking code changes, and testing.
-More bleeding edge pipelines should be added as community pipelines. If usage for a community pipeline is high, the pipeline can be moved to the official pipelines upon request from the community. This is one of the ways we strive to be a community-driven library.
+
+
-To add a community pipeline, one should add a .py file to [examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) and adapt the [examples/community/README.md](https://github.com/huggingface/diffusers/tree/main/examples/community/README.md) to include an example of the new pipeline.
+Share your GitHub pipeline by opening a pull request on the Diffusers [repository](https://github.com/huggingface/diffusers) and adding the one_step_unet.py file to the [examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) subfolder.
-An example can be seen [here](https://github.com/huggingface/diffusers/pull/2400).
+
+
-Community pipeline PRs are only checked at a superficial level and ideally they should be maintained by their original authors.
+Share your Hub pipeline by creating a model repository on the Hub and uploading the one_step_unet.py file to it.
-Contributing a community pipeline is a great way to understand how Diffusers models and schedulers work. Having contributed a community pipeline is usually the first stepping stone to contributing an official pipeline to the
-core package.
+
+
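+Once your pipeline is shared, anyone can load it by passing its file name (for a GitHub pipeline) or the Hub repository id that hosts it to the `custom_pipeline` argument. A minimal sketch using the one-step pipeline from above (the Hub repository id is a placeholder for your own):
+
+```py
+from diffusers import DiffusionPipeline
+
+# GitHub community pipeline: reference the file name in examples/community
+pipeline = DiffusionPipeline.from_pretrained(
+    "google/ddpm-cifar10-32", custom_pipeline="one_step_unet", use_safetensors=True
+)
+
+# Hub community pipeline: reference the model repository that hosts one_step_unet.py
+pipeline = DiffusionPipeline.from_pretrained(
+    "google/ddpm-cifar10-32", custom_pipeline="<your-username>/one_step_unet", use_safetensors=True
+)
+output = pipeline()
+```
+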
### 7. Contribute to training examples
diff --git a/docs/source/en/optimization/fp16.md b/docs/source/en/optimization/fp16.md
index 7a2cf934985c..b21b61368826 100644
--- a/docs/source/en/optimization/fp16.md
+++ b/docs/source/en/optimization/fp16.md
@@ -12,27 +12,23 @@ specific language governing permissions and limitations under the License.
# Speed up inference
-There are several ways to optimize 🤗 Diffusers for inference speed. As a general rule of thumb, we recommend using either [xFormers](xformers) or `torch.nn.functional.scaled_dot_product_attention` in PyTorch 2.0 for their memory-efficient attention.
+There are several ways to optimize Diffusers for inference speed, such as reducing the computational burden by lowering the data precision or using a lightweight distilled model. There are also memory-efficient attention implementations, [xFormers](xformers) and [scaled dot product attention](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) in PyTorch 2.0, that reduce memory usage, which also indirectly speeds up inference. Different speed optimizations can be stacked together to get the fastest inference times.
-
+> [!TIP]
+> Optimizing for inference speed or reduced memory usage can lead to improved performance in the other category, so you should try to optimize for both whenever you can. This guide focuses on inference speed, but you can learn more about lowering memory usage in the [Reduce memory usage](memory) guide.
-In many cases, optimizing for speed or memory leads to improved performance in the other, so you should try to optimize for both whenever you can. This guide focuses on inference speed, but you can learn more about preserving memory in the [Reduce memory usage](memory) guide.
+The inference times below are obtained from generating a single 512x512 image from the prompt "a photo of an astronaut riding a horse on mars" with 50 DDIM steps on an NVIDIA A100.
-
+| setup | latency | speed-up |
+|----------|---------|----------|
+| baseline | 5.27s | x1 |
+| tf32 | 4.14s | x1.27 |
+| fp16 | 3.51s | x1.50 |
+| combined | 3.41s | x1.54 |
-The results below are obtained from generating a single 512x512 image from the prompt `a photo of an astronaut riding a horse on mars` with 50 DDIM steps on a Nvidia Titan RTX, demonstrating the speed-up you can expect.
+## TensorFloat-32
-| | latency | speed-up |
-| ---------------- | ------- | ------- |
-| original | 9.50s | x1 |
-| fp16 | 3.61s | x2.63 |
-| channels last | 3.30s | x2.88 |
-| traced UNet | 3.21s | x2.96 |
-| memory efficient attention | 2.63s | x3.61 |
-
-## Use TensorFloat-32
-
-On Ampere and later CUDA devices, matrix multiplications and convolutions can use the [TensorFloat-32 (TF32)](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) mode for faster, but slightly less accurate computations. By default, PyTorch enables TF32 mode for convolutions but not matrix multiplications. Unless your network requires full float32 precision, we recommend enabling TF32 for matrix multiplications. It can significantly speeds up computations with typically negligible loss in numerical accuracy.
+On Ampere and later CUDA devices, matrix multiplications and convolutions can use the [TensorFloat-32 (tf32)](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) mode for faster, but slightly less accurate computations. By default, PyTorch enables tf32 mode for convolutions but not matrix multiplications. Unless your network requires full float32 precision, we recommend enabling tf32 for matrix multiplications. It can significantly speed up computations with typically negligible loss in numerical accuracy.
```python
import torch
@@ -40,11 +36,11 @@ import torch
torch.backends.cuda.matmul.allow_tf32 = True
```
-You can learn more about TF32 in the [Mixed precision training](https://huggingface.co/docs/transformers/en/perf_train_gpu_one#tf32) guide.
+Learn more about tf32 in the [Mixed precision training](https://huggingface.co/docs/transformers/en/perf_train_gpu_one#tf32) guide.
## Half-precision weights
-To save GPU memory and get more speed, try loading and running the model weights directly in half-precision or float16:
+To save GPU memory and get more speed, set `torch_dtype=torch.float16` to load and run the model weights directly in half-precision.
```Python
import torch
@@ -56,19 +52,76 @@ pipe = DiffusionPipeline.from_pretrained(
use_safetensors=True,
)
pipe = pipe.to("cuda")
-
-prompt = "a photo of an astronaut riding a horse on mars"
-image = pipe(prompt).images[0]
```
-
+> [!WARNING]
+> Don't use [torch.autocast](https://pytorch.org/docs/stable/amp.html#torch.autocast) in any of the pipelines as it can lead to black images and is always slower than pure float16 precision.
-Don't use [`torch.autocast`](https://pytorch.org/docs/stable/amp.html#torch.autocast) in any of the pipelines as it can lead to black images and is always slower than pure float16 precision.
+## Distilled model
-
+You could also use a distilled Stable Diffusion model and autoencoder to speed up inference. During distillation, many of the UNet's residual and attention blocks are shed to reduce the model size by 51% and improve latency on CPU/GPU by 43%. The distilled model is faster and uses less memory while generating images of comparable quality to the full Stable Diffusion model.
-## Distilled model
+> [!TIP]
+> Read the [Open-sourcing Knowledge Distillation Code and Weights of SD-Small and SD-Tiny](https://huggingface.co/blog/sd_distillation) blog post to learn more about how knowledge distillation training works to produce a faster, smaller, and cheaper generative model.
-You could also use a distilled Stable Diffusion model and autoencoder to speed up inference. During distillation, many of the UNet's residual and attention blocks are shed to reduce the model size. The distilled model is faster and uses less memory while generating images of comparable quality to the full Stable Diffusion model.
+The inference times below are obtained from generating 4 images from the prompt "a photo of an astronaut riding a horse on mars" with 25 PNDM steps on an NVIDIA A100. Each generation is repeated 3 times with the distilled Stable Diffusion v1.4 model by [Nota AI](https://hf.co/nota-ai).
+
+| setup | latency | speed-up |
+|------------------------------|---------|----------|
+| baseline | 6.37s | x1 |
+| distilled | 4.18s | x1.52 |
+| distilled + tiny autoencoder | 3.83s | x1.66 |
+
+Let's load the distilled Stable Diffusion model and compare it against the original Stable Diffusion model.
+
+```py
+from diffusers import StableDiffusionPipeline
+import torch
+
+distilled = StableDiffusionPipeline.from_pretrained(
+ "nota-ai/bk-sdm-small", torch_dtype=torch.float16, use_safetensors=True,
+).to("cuda")
+prompt = "a golden vase with different flowers"
+generator = torch.manual_seed(2023)
+image = distilled(prompt, num_inference_steps=25, generator=generator).images[0]
+image
+```
+
+<!-- image comparison: original Stable Diffusion | distilled Stable Diffusion -->
+
+### Tiny AutoEncoder
+
+To speed inference up even more, replace the autoencoder with a [distilled version](https://huggingface.co/sayakpaul/taesdxl-diffusers) of it.
+
+```py
+import torch
+from diffusers import AutoencoderTiny, StableDiffusionPipeline
+
+distilled = StableDiffusionPipeline.from_pretrained(
+ "nota-ai/bk-sdm-small", torch_dtype=torch.float16, use_safetensors=True,
+).to("cuda")
+distilled.vae = AutoencoderTiny.from_pretrained(
+ "sayakpaul/taesd-diffusers", torch_dtype=torch.float16, use_safetensors=True,
+).to("cuda")
+
+prompt = "a golden vase with different flowers"
+generator = torch.manual_seed(2023)
+image = distilled(prompt, num_inference_steps=25, generator=generator).images[0]
+image
+```
-Learn more about in the [Distilled Stable Diffusion inference](../using-diffusers/distilled_sd) guide!
+
+<!-- image: distilled Stable Diffusion + Tiny AutoEncoder -->
+
diff --git a/docs/source/en/stable_diffusion.md b/docs/source/en/stable_diffusion.md
index 877a4ac70823..db4953ebbffd 100644
--- a/docs/source/en/stable_diffusion.md
+++ b/docs/source/en/stable_diffusion.md
@@ -49,7 +49,7 @@ One of the simplest ways to speed up inference is to place the pipeline on a GPU
pipeline = pipeline.to("cuda")
```
-To make sure you can use the same image and improve on it, use a [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) and set a seed for [reproducibility](./using-diffusers/reproducibility):
+To make sure you can use the same image and improve on it, use a [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) and set a seed for [reproducibility](./using-diffusers/reusing_seeds):
```python
import torch
diff --git a/docs/source/en/training/kandinsky.md b/docs/source/en/training/kandinsky.md
index 2caec1035fa9..a1854d76c492 100644
--- a/docs/source/en/training/kandinsky.md
+++ b/docs/source/en/training/kandinsky.md
@@ -205,7 +205,7 @@ model_pred = unet(noisy_latents, timesteps, None, added_cond_kwargs=added_cond_k
Once you’ve made all your changes or you’re okay with the default configuration, you’re ready to launch the training script! 🚀
-You'll train on the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset to generate your own Pokémon, but you can also create and train on your own dataset by following the [Create a dataset for training](create_dataset) guide. Set the environment variable `DATASET_NAME` to the name of the dataset on the Hub or if you're training on your own files, set the environment variable `TRAIN_DIR` to a path to your dataset.
+You'll train on the [Naruto BLIP captions](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) dataset to generate your own Naruto characters, but you can also create and train on your own dataset by following the [Create a dataset for training](create_dataset) guide. Set the environment variable `DATASET_NAME` to the name of the dataset on the Hub or if you're training on your own files, set the environment variable `TRAIN_DIR` to a path to your dataset.
If you’re training on more than one GPU, add the `--multi_gpu` parameter to the `accelerate launch` command.
@@ -219,7 +219,7 @@ To monitor training progress with Weights & Biases, add the `--report_to=wandb`
```bash
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
accelerate launch --mixed_precision="fp16" train_text_to_image_prior.py \
--dataset_name=$DATASET_NAME \
@@ -232,17 +232,17 @@ accelerate launch --mixed_precision="fp16" train_text_to_image_prior.py \
--checkpoints_total_limit=3 \
--lr_scheduler="constant" \
--lr_warmup_steps=0 \
- --validation_prompts="A robot pokemon, 4k photo" \
+ --validation_prompts="A robot naruto, 4k photo" \
--report_to="wandb" \
--push_to_hub \
- --output_dir="kandi2-prior-pokemon-model"
+ --output_dir="kandi2-prior-naruto-model"
```
```bash
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
accelerate launch --mixed_precision="fp16" train_text_to_image_decoder.py \
--dataset_name=$DATASET_NAME \
@@ -256,10 +256,10 @@ accelerate launch --mixed_precision="fp16" train_text_to_image_decoder.py \
--checkpoints_total_limit=3 \
--lr_scheduler="constant" \
--lr_warmup_steps=0 \
- --validation_prompts="A robot pokemon, 4k photo" \
+ --validation_prompts="A robot naruto, 4k photo" \
--report_to="wandb" \
--push_to_hub \
- --output_dir="kandi2-decoder-pokemon-model"
+ --output_dir="kandi2-decoder-naruto-model"
```
@@ -279,7 +279,7 @@ prior_components = {"prior_" + k: v for k,v in prior_pipeline.components.items()
pipeline = AutoPipelineForText2Image.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", **prior_components, torch_dtype=torch.float16)
pipe.enable_model_cpu_offload()
-prompt="A robot pokemon, 4k photo"
+prompt="A robot naruto, 4k photo"
image = pipeline(prompt=prompt, negative_prompt=negative_prompt).images[0]
```
@@ -299,7 +299,7 @@ import torch
pipeline = AutoPipelineForText2Image.from_pretrained("path/to/saved/model", torch_dtype=torch.float16)
pipeline.enable_model_cpu_offload()
-prompt="A robot pokemon, 4k photo"
+prompt="A robot naruto, 4k photo"
image = pipeline(prompt=prompt).images[0]
```
@@ -313,7 +313,7 @@ unet = UNet2DConditionModel.from_pretrained("path/to/saved/model" + "/checkpoint
pipeline = AutoPipelineForText2Image.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", unet=unet, torch_dtype=torch.float16)
pipeline.enable_model_cpu_offload()
-image = pipeline(prompt="A robot pokemon, 4k photo").images[0]
+image = pipeline(prompt="A robot naruto, 4k photo").images[0]
```
diff --git a/docs/source/en/training/lora.md b/docs/source/en/training/lora.md
index 78ac8a140e7c..737e6f0dfc32 100644
--- a/docs/source/en/training/lora.md
+++ b/docs/source/en/training/lora.md
@@ -170,7 +170,7 @@ Aside from setting up the LoRA layers, the training script is more or less the s
Once you've made all your changes or you're okay with the default configuration, you're ready to launch the training script! 🚀
-Let's train on the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset to generate our own Pokémon. Set the environment variables `MODEL_NAME` and `DATASET_NAME` to the model and dataset respectively. You should also specify where to save the model in `OUTPUT_DIR`, and the name of the model to save to on the Hub with `HUB_MODEL_ID`. The script creates and saves the following files to your repository:
+Let's train on the [Naruto BLIP captions](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) dataset to generate your own Naruto characters. Set the environment variables `MODEL_NAME` and `DATASET_NAME` to the model and dataset respectively. You should also specify where to save the model in `OUTPUT_DIR`, and the name of the model to save to on the Hub with `HUB_MODEL_ID`. The script creates and saves the following files to your repository:
- saved model checkpoints
- `pytorch_lora_weights.safetensors` (the trained LoRA weights)
@@ -185,9 +185,9 @@ A full training run takes ~5 hours on a 2080 Ti GPU with 11GB of VRAM.
```bash
export MODEL_NAME="runwayml/stable-diffusion-v1-5"
-export OUTPUT_DIR="/sddata/finetune/lora/pokemon"
-export HUB_MODEL_ID="pokemon-lora"
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export OUTPUT_DIR="/sddata/finetune/lora/naruto"
+export HUB_MODEL_ID="naruto-lora"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
accelerate launch --mixed_precision="fp16" train_text_to_image_lora.py \
--pretrained_model_name_or_path=$MODEL_NAME \
@@ -208,7 +208,7 @@ accelerate launch --mixed_precision="fp16" train_text_to_image_lora.py \
--hub_model_id=${HUB_MODEL_ID} \
--report_to=wandb \
--checkpointing_steps=500 \
- --validation_prompt="A pokemon with blue eyes." \
+ --validation_prompt="A naruto with blue eyes." \
--seed=1337
```
@@ -220,7 +220,7 @@ import torch
pipeline = AutoPipelineForText2Image.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda")
pipeline.load_lora_weights("path/to/lora/model", weight_name="pytorch_lora_weights.safetensors")
-image = pipeline("A pokemon with blue eyes").images[0]
+image = pipeline("A naruto with blue eyes").images[0]
```
## Next steps
diff --git a/docs/source/en/training/sdxl.md b/docs/source/en/training/sdxl.md
index 0e51e720b48c..78178047d9fd 100644
--- a/docs/source/en/training/sdxl.md
+++ b/docs/source/en/training/sdxl.md
@@ -176,7 +176,7 @@ If you want to learn more about how the training loop works, check out the [Unde
Once you’ve made all your changes or you’re okay with the default configuration, you’re ready to launch the training script! 🚀
-Let’s train on the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset to generate your own Pokémon. Set the environment variables `MODEL_NAME` and `DATASET_NAME` to the model and the dataset (either from the Hub or a local path). You should also specify a VAE other than the SDXL VAE (either from the Hub or a local path) with `VAE_NAME` to avoid numerical instabilities.
+Let’s train on the [Naruto BLIP captions](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) dataset to generate your own Naruto characters. Set the environment variables `MODEL_NAME` and `DATASET_NAME` to the model and the dataset (either from the Hub or a local path). You should also specify a VAE other than the SDXL VAE (either from the Hub or a local path) with `VAE_NAME` to avoid numerical instabilities.
@@ -187,7 +187,7 @@ To monitor training progress with Weights & Biases, add the `--report_to=wandb`
```bash
export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0"
export VAE_NAME="madebyollin/sdxl-vae-fp16-fix"
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
accelerate launch train_text_to_image_sdxl.py \
--pretrained_model_name_or_path=$MODEL_NAME \
@@ -211,7 +211,7 @@ accelerate launch train_text_to_image_sdxl.py \
--validation_prompt="a cute Sundar Pichai creature" \
--validation_epochs 5 \
--checkpointing_steps=5000 \
- --output_dir="sdxl-pokemon-model" \
+ --output_dir="sdxl-naruto-model" \
--push_to_hub
```
@@ -226,9 +226,9 @@ import torch
pipeline = DiffusionPipeline.from_pretrained("path/to/your/model", torch_dtype=torch.float16).to("cuda")
-prompt = "A pokemon with green eyes and red legs."
+prompt = "A naruto with green eyes and red legs."
image = pipeline(prompt, num_inference_steps=30, guidance_scale=7.5).images[0]
-image.save("pokemon.png")
+image.save("naruto.png")
```
@@ -244,11 +244,11 @@ import torch_xla.core.xla_model as xm
device = xm.xla_device()
pipeline = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0").to(device)
-prompt = "A pokemon with green eyes and red legs."
+prompt = "A naruto with green eyes and red legs."
start = time()
image = pipeline(prompt, num_inference_steps=inference_steps).images[0]
print(f'Compilation time is {time()-start} sec')
-image.save("pokemon.png")
+image.save("naruto.png")
start = time()
image = pipeline(prompt, num_inference_steps=inference_steps).images[0]
diff --git a/docs/source/en/training/text2image.md b/docs/source/en/training/text2image.md
index d5c772c9db86..f69e9a710e8f 100644
--- a/docs/source/en/training/text2image.md
+++ b/docs/source/en/training/text2image.md
@@ -158,7 +158,7 @@ Once you've made all your changes or you're okay with the default configuration,
-Let's train on the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset to generate your own Pokémon. Set the environment variables `MODEL_NAME` and `dataset_name` to the model and the dataset (either from the Hub or a local path). If you're training on more than one GPU, add the `--multi_gpu` parameter to the `accelerate launch` command.
+Let's train on the [Naruto BLIP captions](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) dataset to generate your own Naruto characters. Set the environment variables `MODEL_NAME` and `dataset_name` to the model and the dataset (either from the Hub or a local path). If you're training on more than one GPU, add the `--multi_gpu` parameter to the `accelerate launch` command.
@@ -168,7 +168,7 @@ To train on a local dataset, set the `TRAIN_DIR` and `OUTPUT_DIR` environment va
```bash
export MODEL_NAME="runwayml/stable-diffusion-v1-5"
-export dataset_name="lambdalabs/pokemon-blip-captions"
+export dataset_name="lambdalabs/naruto-blip-captions"
accelerate launch --mixed_precision="fp16" train_text_to_image.py \
--pretrained_model_name_or_path=$MODEL_NAME \
@@ -183,7 +183,7 @@ accelerate launch --mixed_precision="fp16" train_text_to_image.py \
--max_grad_norm=1 \
--enable_xformers_memory_efficient_attention
--lr_scheduler="constant" --lr_warmup_steps=0 \
- --output_dir="sd-pokemon-model" \
+ --output_dir="sd-naruto-model" \
--push_to_hub
```
@@ -202,7 +202,7 @@ To train on a local dataset, set the `TRAIN_DIR` and `OUTPUT_DIR` environment va
```bash
export MODEL_NAME="runwayml/stable-diffusion-v1-5"
-export dataset_name="lambdalabs/pokemon-blip-captions"
+export dataset_name="lambdalabs/naruto-blip-captions"
python train_text_to_image_flax.py \
--pretrained_model_name_or_path=$MODEL_NAME \
@@ -212,7 +212,7 @@ python train_text_to_image_flax.py \
--max_train_steps=15000 \
--learning_rate=1e-05 \
--max_grad_norm=1 \
- --output_dir="sd-pokemon-model" \
+ --output_dir="sd-naruto-model" \
--push_to_hub
```
@@ -231,7 +231,7 @@ import torch
pipeline = StableDiffusionPipeline.from_pretrained("path/to/saved_model", torch_dtype=torch.float16, use_safetensors=True).to("cuda")
image = pipeline(prompt="yoda").images[0]
-image.save("yoda-pokemon.png")
+image.save("yoda-naruto.png")
```
@@ -246,7 +246,7 @@ from diffusers import FlaxStableDiffusionPipeline
pipeline, params = FlaxStableDiffusionPipeline.from_pretrained("path/to/saved_model", dtype=jax.numpy.bfloat16)
-prompt = "yoda pokemon"
+prompt = "yoda naruto"
prng_seed = jax.random.PRNGKey(0)
num_inference_steps = 50
@@ -261,7 +261,7 @@ prompt_ids = shard(prompt_ids)
images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).images
images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
-image.save("yoda-pokemon.png")
+image.save("yoda-naruto.png")
```
diff --git a/docs/source/en/training/wuerstchen.md b/docs/source/en/training/wuerstchen.md
index c8d2842eb833..cd190639b865 100644
--- a/docs/source/en/training/wuerstchen.md
+++ b/docs/source/en/training/wuerstchen.md
@@ -131,7 +131,7 @@ If you want to learn more about how the training loop works, check out the [Unde
Once you’ve made all your changes or you’re okay with the default configuration, you’re ready to launch the training script! 🚀
-Set the `DATASET_NAME` environment variable to the dataset name from the Hub. This guide uses the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset, but you can create and train on your own datasets as well (see the [Create a dataset for training](create_dataset) guide).
+Set the `DATASET_NAME` environment variable to the dataset name from the Hub. This guide uses the [Naruto BLIP captions](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) dataset, but you can create and train on your own datasets as well (see the [Create a dataset for training](create_dataset) guide).
@@ -140,7 +140,7 @@ To monitor training progress with Weights & Biases, add the `--report_to=wandb`
```bash
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
accelerate launch train_text_to_image_prior.py \
--mixed_precision="fp16" \
@@ -156,10 +156,10 @@ accelerate launch train_text_to_image_prior.py \
--checkpoints_total_limit=3 \
--lr_scheduler="constant" \
--lr_warmup_steps=0 \
- --validation_prompts="A robot pokemon, 4k photo" \
+ --validation_prompts="A robot naruto, 4k photo" \
--report_to="wandb" \
--push_to_hub \
- --output_dir="wuerstchen-prior-pokemon-model"
+ --output_dir="wuerstchen-prior-naruto-model"
```
Once training is complete, you can use your newly trained model for inference!
@@ -171,7 +171,7 @@ from diffusers.pipelines.wuerstchen import DEFAULT_STAGE_C_TIMESTEPS
pipeline = AutoPipelineForText2Image.from_pretrained("path/to/saved/model", torch_dtype=torch.float16).to("cuda")
-caption = "A cute bird pokemon holding a shield"
+caption = "A cute bird naruto holding a shield"
images = pipeline(
caption,
width=1024,
diff --git a/docs/source/en/using-diffusers/contribute_pipeline.md b/docs/source/en/using-diffusers/contribute_pipeline.md
deleted file mode 100644
index e9cf1ed1ce02..000000000000
--- a/docs/source/en/using-diffusers/contribute_pipeline.md
+++ /dev/null
@@ -1,184 +0,0 @@
-
-
-# Contribute a community pipeline
-
-
-
-💡 Take a look at GitHub Issue [#841](https://github.com/huggingface/diffusers/issues/841) for more context about why we're adding community pipelines to help everyone easily share their work without being slowed down.
-
-
-
-Community pipelines allow you to add any additional features you'd like on top of the [`DiffusionPipeline`]. The main benefit of building on top of the `DiffusionPipeline` is anyone can load and use your pipeline by only adding one more argument, making it super easy for the community to access.
-
-This guide will show you how to create a community pipeline and explain how they work. To keep things simple, you'll create a "one-step" pipeline where the `UNet` does a single forward pass and calls the scheduler once.
-
-## Initialize the pipeline
-
-You should start by creating a `one_step_unet.py` file for your community pipeline. In this file, create a pipeline class that inherits from the [`DiffusionPipeline`] to be able to load model weights and the scheduler configuration from the Hub. The one-step pipeline needs a `UNet` and a scheduler, so you'll need to add these as arguments to the `__init__` function:
-
-```python
-from diffusers import DiffusionPipeline
-import torch
-
-class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
- def __init__(self, unet, scheduler):
- super().__init__()
-```
-
-To ensure your pipeline and its components (`unet` and `scheduler`) can be saved with [`~DiffusionPipeline.save_pretrained`], add them to the `register_modules` function:
-
-```diff
- from diffusers import DiffusionPipeline
- import torch
-
- class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
- def __init__(self, unet, scheduler):
- super().__init__()
-
-+ self.register_modules(unet=unet, scheduler=scheduler)
-```
-
-Cool, the `__init__` step is done and you can move to the forward pass now! 🔥
-
-## Define the forward pass
-
-In the forward pass, which we recommend defining as `__call__`, you have complete creative freedom to add whatever feature you'd like. For our amazing one-step pipeline, create a random image and only call the `unet` and `scheduler` once by setting `timestep=1`:
-
-```diff
- from diffusers import DiffusionPipeline
- import torch
-
- class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
- def __init__(self, unet, scheduler):
- super().__init__()
-
- self.register_modules(unet=unet, scheduler=scheduler)
-
-+ def __call__(self):
-+ image = torch.randn(
-+ (1, self.unet.config.in_channels, self.unet.config.sample_size, self.unet.config.sample_size),
-+ )
-+ timestep = 1
-
-+ model_output = self.unet(image, timestep).sample
-+ scheduler_output = self.scheduler.step(model_output, timestep, image).prev_sample
-
-+ return scheduler_output
-```
-
-That's it! 🚀 You can now run this pipeline by passing a `unet` and `scheduler` to it:
-
-```python
-from diffusers import DDPMScheduler, UNet2DModel
-
-scheduler = DDPMScheduler()
-unet = UNet2DModel()
-
-pipeline = UnetSchedulerOneForwardPipeline(unet=unet, scheduler=scheduler)
-
-output = pipeline()
-```
-
-But what's even better is you can load pre-existing weights into the pipeline if the pipeline structure is identical. For example, you can load the [`google/ddpm-cifar10-32`](https://huggingface.co/google/ddpm-cifar10-32) weights into the one-step pipeline:
-
-```python
-pipeline = UnetSchedulerOneForwardPipeline.from_pretrained("google/ddpm-cifar10-32", use_safetensors=True)
-
-output = pipeline()
-```
-
-## Share your pipeline
-
-Open a Pull Request on the 🧨 Diffusers [repository](https://github.com/huggingface/diffusers) to add your awesome pipeline in `one_step_unet.py` to the [examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) subfolder.
-
-Once it is merged, anyone with `diffusers >= 0.4.0` installed can use this pipeline magically 🪄 by specifying it in the `custom_pipeline` argument:
-
-```python
-from diffusers import DiffusionPipeline
-
-pipe = DiffusionPipeline.from_pretrained(
- "google/ddpm-cifar10-32", custom_pipeline="one_step_unet", use_safetensors=True
-)
-pipe()
-```
-
-Another way to share your community pipeline is to upload the `one_step_unet.py` file directly to your preferred [model repository](https://huggingface.co/docs/hub/models-uploading) on the Hub. Instead of specifying the `one_step_unet.py` file, pass the model repository id to the `custom_pipeline` argument:
-
-```python
-from diffusers import DiffusionPipeline
-
-pipeline = DiffusionPipeline.from_pretrained(
- "google/ddpm-cifar10-32", custom_pipeline="stevhliu/one_step_unet", use_safetensors=True
-)
-```
-
-Take a look at the following table to compare the two sharing workflows to help you decide the best option for you:
-
-| | GitHub community pipeline | HF Hub community pipeline |
-|----------------|------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------|
-| usage | same | same |
-| review process | open a Pull Request on GitHub and undergo a review process from the Diffusers team before merging; may be slower | upload directly to a Hub repository without any review; this is the fastest workflow |
-| visibility | included in the official Diffusers repository and documentation | included on your HF Hub profile and relies on your own usage/promotion to gain visibility |
-
-
-
-💡 You can use whatever package you want in your community pipeline file - as long as the user has it installed, everything will work fine. Make sure you have one and only one pipeline class that inherits from `DiffusionPipeline` because this is automatically detected.
-
-
-
-## How do community pipelines work?
-
-A community pipeline is a class that inherits from [`DiffusionPipeline`] which means:
-
-- It can be loaded with the [`custom_pipeline`] argument.
-- The model weights and scheduler configuration are loaded from [`pretrained_model_name_or_path`].
-- The code that implements a feature in the community pipeline is defined in a `pipeline.py` file.
-
-Sometimes you can't load all the pipeline components weights from an official repository. In this case, the other components should be passed directly to the pipeline:
-
-```python
-from diffusers import DiffusionPipeline
-from transformers import CLIPImageProcessor, CLIPModel
-
-model_id = "CompVis/stable-diffusion-v1-4"
-clip_model_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
-
-feature_extractor = CLIPImageProcessor.from_pretrained(clip_model_id)
-clip_model = CLIPModel.from_pretrained(clip_model_id, torch_dtype=torch.float16)
-
-pipeline = DiffusionPipeline.from_pretrained(
- model_id,
- custom_pipeline="clip_guided_stable_diffusion",
- clip_model=clip_model,
- feature_extractor=feature_extractor,
- scheduler=scheduler,
- torch_dtype=torch.float16,
- use_safetensors=True,
-)
-```
-
-The magic behind community pipelines is contained in the following code. It allows the community pipeline to be loaded from GitHub or the Hub, and it'll be available to all 🧨 Diffusers packages.
-
-```python
-# 2. Load the pipeline class, if using custom module then load it from the Hub
-# if we load from explicit class, let's use it
-if custom_pipeline is not None:
- pipeline_class = get_class_from_dynamic_module(
- custom_pipeline, module_file=CUSTOM_PIPELINE_FILE_NAME, cache_dir=custom_pipeline
- )
-elif cls != DiffusionPipeline:
- pipeline_class = cls
-else:
- diffusers_module = importlib.import_module(cls.__module__.split(".")[0])
- pipeline_class = getattr(diffusers_module, config_dict["_class_name"])
-```
diff --git a/docs/source/en/using-diffusers/control_brightness.md b/docs/source/en/using-diffusers/control_brightness.md
deleted file mode 100644
index 5fad664f60f3..000000000000
--- a/docs/source/en/using-diffusers/control_brightness.md
+++ /dev/null
@@ -1,58 +0,0 @@
-
-
-# Control image brightness
-
-The Stable Diffusion pipeline is mediocre at generating images that are either very bright or dark as explained in the [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) paper. The solutions proposed in the paper are currently implemented in the [`DDIMScheduler`] which you can use to improve the lighting in your images.
-
-
-
-💡 Take a look at the paper linked above for more details about the proposed solutions!
-
-
-
-One of the solutions is to train a model with *v prediction* and *v loss*. Add the following flag to the [`train_text_to_image.py`](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py) or [`train_text_to_image_lora.py`](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora.py) scripts to enable `v_prediction`:
-
-```bash
---prediction_type="v_prediction"
-```
-
-For example, let's use the [`ptx0/pseudo-journey-v2`](https://huggingface.co/ptx0/pseudo-journey-v2) checkpoint which has been finetuned with `v_prediction`.
-
-Next, configure the following parameters in the [`DDIMScheduler`]:
-
-1. `rescale_betas_zero_snr=True`, rescales the noise schedule to zero terminal signal-to-noise ratio (SNR)
-2. `timestep_spacing="trailing"`, starts sampling from the last timestep
-
-```py
-from diffusers import DiffusionPipeline, DDIMScheduler
-
-pipeline = DiffusionPipeline.from_pretrained("ptx0/pseudo-journey-v2", use_safetensors=True)
-
-# switch the scheduler in the pipeline to use the DDIMScheduler
-pipeline.scheduler = DDIMScheduler.from_config(
- pipeline.scheduler.config, rescale_betas_zero_snr=True, timestep_spacing="trailing"
-)
-pipeline.to("cuda")
-```
-
-Finally, in your call to the pipeline, set `guidance_rescale` to prevent overexposure:
-
-```py
-prompt = "A lion in galaxies, spirals, nebulae, stars, smoke, iridescent, intricate detail, octane render, 8k"
-image = pipeline(prompt, guidance_rescale=0.7).images[0]
-image
-```
-
-
-
-
diff --git a/docs/source/en/using-diffusers/custom_pipeline_examples.md b/docs/source/en/using-diffusers/custom_pipeline_examples.md
deleted file mode 100644
index 203302ed3ead..000000000000
--- a/docs/source/en/using-diffusers/custom_pipeline_examples.md
+++ /dev/null
@@ -1,119 +0,0 @@
-
-
-# Community pipelines
-
-[[open-in-colab]]
-
-
-
-For more context about the design choices behind community pipelines, please have a look at [this issue](https://github.com/huggingface/diffusers/issues/841).
-
-
-
-Community pipelines allow you to get creative and build your own unique pipelines to share with the community. You can find all community pipelines in the [diffusers/examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) folder along with inference and training examples for how to use them. This guide showcases some of the community pipelines and hopefully it'll inspire you to create your own (feel free to open a PR with your own pipeline and we will merge it!).
-
-To load a community pipeline, use the `custom_pipeline` argument in [`DiffusionPipeline`] to specify one of the files in [diffusers/examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community):
-
-```py
-from diffusers import DiffusionPipeline
-
-pipe = DiffusionPipeline.from_pretrained(
- "CompVis/stable-diffusion-v1-4", custom_pipeline="filename_in_the_community_folder", use_safetensors=True
-)
-```
-
-If a community pipeline doesn't work as expected, please open a GitHub issue and mention the author.
-
-You can learn more about community pipelines in the how to [load community pipelines](custom_pipeline_overview) and how to [contribute a community pipeline](contribute_pipeline) guides.
-
-## Multilingual Stable Diffusion
-
-The multilingual Stable Diffusion pipeline uses a pretrained [XLM-RoBERTa](https://huggingface.co/papluca/xlm-roberta-base-language-detection) to identify a language and the [mBART-large-50](https://huggingface.co/facebook/mbart-large-50-many-to-one-mmt) model to handle the translation. This allows you to generate images from text in 20 languages.
-
-```py
-import torch
-from diffusers import DiffusionPipeline
-from diffusers.utils import make_image_grid
-from transformers import (
- pipeline,
- MBart50TokenizerFast,
- MBartForConditionalGeneration,
-)
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-device_dict = {"cuda": 0, "cpu": -1}
-
-# add language detection pipeline
-language_detection_model_ckpt = "papluca/xlm-roberta-base-language-detection"
-language_detection_pipeline = pipeline("text-classification",
- model=language_detection_model_ckpt,
- device=device_dict[device])
-
-# add model for language translation
-translation_tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-one-mmt")
-translation_model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-one-mmt").to(device)
-
-diffuser_pipeline = DiffusionPipeline.from_pretrained(
- "CompVis/stable-diffusion-v1-4",
- custom_pipeline="multilingual_stable_diffusion",
- detection_pipeline=language_detection_pipeline,
- translation_model=translation_model,
- translation_tokenizer=translation_tokenizer,
- torch_dtype=torch.float16,
-)
-
-diffuser_pipeline.enable_attention_slicing()
-diffuser_pipeline = diffuser_pipeline.to(device)
-
-prompt = ["a photograph of an astronaut riding a horse",
- "Una casa en la playa",
- "Ein Hund, der Orange isst",
- "Un restaurant parisien"]
-
-images = diffuser_pipeline(prompt).images
-make_image_grid(images, rows=2, cols=2)
-```
-
-
-
-
-
-## MagicMix
-
-[MagicMix](https://huggingface.co/papers/2210.16056) is a pipeline that can mix an image and text prompt to generate a new image that preserves the image structure. The `mix_factor` determines how much influence the prompt has on the layout generation, `kmin` controls the number of steps during the content generation process, and `kmax` determines how much information is kept in the layout of the original image.
-
-```py
-from diffusers import DiffusionPipeline, DDIMScheduler
-from diffusers.utils import load_image, make_image_grid
-
-pipeline = DiffusionPipeline.from_pretrained(
- "CompVis/stable-diffusion-v1-4",
- custom_pipeline="magic_mix",
- scheduler=DDIMScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler"),
-).to('cuda')
-
-img = load_image("https://user-images.githubusercontent.com/59410571/209578593-141467c7-d831-4792-8b9a-b17dc5e47816.jpg")
-mix_img = pipeline(img, prompt="bed", kmin=0.3, kmax=0.5, mix_factor=0.5)
-make_image_grid([img, mix_img], rows=1, cols=2)
-```
-
-
-
-
-
original image
-
-
-
-
image and text prompt mix
-
-
diff --git a/docs/source/en/using-diffusers/custom_pipeline_overview.md b/docs/source/en/using-diffusers/custom_pipeline_overview.md
index 0b6bb53f10d6..ef26e546e4d4 100644
--- a/docs/source/en/using-diffusers/custom_pipeline_overview.md
+++ b/docs/source/en/using-diffusers/custom_pipeline_overview.md
@@ -16,11 +16,19 @@ specific language governing permissions and limitations under the License.
## Community pipelines
+> [!TIP]
+> Take a look at GitHub Issue [#841](https://github.com/huggingface/diffusers/issues/841) for more context about why we're adding community pipelines to help everyone easily share their work without being slowed down.
+
Community pipelines are any [`DiffusionPipeline`] class that is different from the original paper implementation (for example, the [`StableDiffusionControlNetPipeline`] corresponds to the [Text-to-Image Generation with ControlNet Conditioning](https://arxiv.org/abs/2302.05543) paper). They provide additional functionality or extend the original implementation of a pipeline.
There are many cool community pipelines like [Marigold Depth Estimation](https://github.com/huggingface/diffusers/tree/main/examples/community#marigold-depth-estimation) or [InstantID](https://github.com/huggingface/diffusers/tree/main/examples/community#instantid-pipeline), and you can find all the official community pipelines [here](https://github.com/huggingface/diffusers/tree/main/examples/community).
-There are two types of community pipelines, those stored on the Hugging Face Hub and those stored on Diffusers GitHub repository. Hub pipelines are completely customizable (scheduler, models, pipeline code, etc.) while Diffusers GitHub pipelines are only limited to custom pipeline code. Refer to this [table](./contribute_pipeline#share-your-pipeline) for a more detailed comparison of Hub vs GitHub community pipelines.
+There are two types of community pipelines: those stored on the Hugging Face Hub and those stored in the Diffusers GitHub repository. Hub pipelines are completely customizable (scheduler, models, pipeline code, etc.), while Diffusers GitHub pipelines are limited to custom pipeline code only.
+
+| | GitHub community pipeline | HF Hub community pipeline |
+|----------------|------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------|
+| usage | same | same |
+| review process | open a Pull Request on GitHub and undergo a review process from the Diffusers team before merging; may be slower | upload directly to a Hub repository without any review; this is the fastest workflow |
+| visibility | included in the official Diffusers repository and documentation | included on your HF Hub profile and relies on your own usage/promotion to gain visibility |
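+
+For either type, only the value passed to `custom_pipeline` changes: a file name from the community folder for GitHub pipelines, or a Hub repository id that contains a `pipeline.py` file for Hub pipelines. A minimal sketch (both `custom_pipeline` values below are placeholders):
+
+```py
+from diffusers import DiffusionPipeline
+
+# GitHub community pipeline: pass the file name from diffusers/examples/community
+pipeline = DiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4", custom_pipeline="filename_in_the_community_folder", use_safetensors=True
+)
+
+# Hub community pipeline: pass the repository id that hosts the pipeline.py file
+pipeline = DiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4", custom_pipeline="your-username/your-custom-pipeline", use_safetensors=True
+)
+```
+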
+## Deterministic batch generation
-Generate the images and have a look:
+A practical application of creating reproducible pipelines is *deterministic batch generation*. You generate a batch of images and select one image to improve with a more detailed prompt. The main idea is to pass a list of [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) objects to the pipeline and tie each `Generator` to a seed so you can reuse it.
-```python
-images = pipe(prompt, generator=generator, num_images_per_prompt=4).images
-make_image_grid(images, rows=2, cols=2)
+Let's use the [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) checkpoint and generate a batch of images.
+
+```py
+import torch
+from diffusers import DiffusionPipeline
+from diffusers.utils import make_image_grid
+
+pipeline = DiffusionPipeline.from_pretrained(
+ "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True
+)
+pipeline = pipeline.to("cuda")
```
-![img](https://huggingface.co/datasets/diffusers/diffusers-images-docs/resolve/main/reusabe_seeds.jpg)
+Define four different `Generator`s and assign each `Generator` a seed (`0` to `3`). Then generate a batch of images and pick one to iterate on.
-In this example, you'll improve upon the first image - but in reality, you can use any image you want (even the image with double sets of eyes!). The first image used the `Generator` with seed `0`, so you'll reuse that `Generator` for the second round of inference. To improve the quality of the image, add some additional text to the prompt:
+> [!WARNING]
+> Use a list comprehension that iterates over the batch size specified in `range()` to create a unique `Generator` object for each image in the batch. If you multiply the `Generator` by the batch size integer, it only creates *one* `Generator` object that is used sequentially for each image in the batch.
+>
+> ```py
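+> # this creates only one Generator object, which is reused for every image in the batch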
+> [torch.Generator().manual_seed(seed)] * 4
+> ```
```python
-prompt = [prompt + t for t in [", highly realistic", ", artsy", ", trending", ", colorful"]]
-generator = [torch.Generator(device="cuda").manual_seed(0) for i in range(4)]
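+# a list comprehension creates four distinct Generator objects, seeded 0 through 3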
+generator = [torch.Generator(device="cuda").manual_seed(i) for i in range(4)]
+prompt = "Labrador in the style of Vermeer"
+images = pipeline(prompt, generator=generator, num_images_per_prompt=4).images
+make_image_grid(images, rows=2, cols=2)
```
-Create four generators with seed `0`, and generate another batch of images, all of which should look like the first image from the previous round!
+
+
+
+
+Let's improve the first image (you can choose any image you want), which corresponds to the `Generator` with seed `0`. Add some additional text to your prompt, and make sure you reuse the same `Generator` with seed `0`. All the generated images should resemble the first image.
```python
-images = pipe(prompt, generator=generator).images
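+# reuse the seed 0 Generator so each prompt variation starts from the same noise as the first image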
+prompt = [prompt + t for t in [", highly realistic", ", artsy", ", trending", ", colorful"]]
+generator = [torch.Generator(device="cuda").manual_seed(0) for i in range(4)]
+images = pipeline(prompt, generator=generator).images
make_image_grid(images, rows=2, cols=2)
```
-![img](https://huggingface.co/datasets/diffusers/diffusers-images-docs/resolve/main/reusabe_seeds_2.jpg)
+
+
+
diff --git a/docs/source/ja/stable_diffusion.md b/docs/source/ja/stable_diffusion.md
index cae244dab2b6..b22178b75529 100644
--- a/docs/source/ja/stable_diffusion.md
+++ b/docs/source/ja/stable_diffusion.md
@@ -49,7 +49,7 @@ prompt = "portrait photo of a old warrior chief"
pipeline = pipeline.to("cuda")
```
-同じイメージを使って改良できるようにするには、[`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html)を使い、[reproducibility](./using-diffusers/reproducibility)の種を設定します:
+同じイメージを使って改良できるようにするには、[`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html)を使い、[reproducibility](./using-diffusers/reusing_seeds)の種を設定します:
```python
import torch
diff --git a/docs/source/ko/stable_diffusion.md b/docs/source/ko/stable_diffusion.md
index 678db4ff9eae..342f17bbff31 100644
--- a/docs/source/ko/stable_diffusion.md
+++ b/docs/source/ko/stable_diffusion.md
@@ -49,7 +49,7 @@ prompt = "portrait photo of a old warrior chief"
pipeline = pipeline.to("cuda")
```
-동일한 이미지를 사용하고 개선할 수 있는지 확인하려면 [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html)를 사용하고 [재현성](./using-diffusers/reproducibility)에 대한 시드를 설정하세요:
+동일한 이미지를 사용하고 개선할 수 있는지 확인하려면 [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html)를 사용하고 [재현성](./using-diffusers/reusing_seeds)에 대한 시드를 설정하세요:
```python
import torch
diff --git a/docs/source/ko/training/lora.md b/docs/source/ko/training/lora.md
index 5bb8a1e69be4..e9c690d80652 100644
--- a/docs/source/ko/training/lora.md
+++ b/docs/source/ko/training/lora.md
@@ -49,15 +49,15 @@ huggingface-cli login
### 학습[[dreambooth-training]]
-[Pokémon BLIP 캡션](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) 데이터셋으로 [`stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5)를 파인튜닝해 나만의 포켓몬을 생성해 보겠습니다.
+[Naruto BLIP 캡션](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) 데이터셋으로 [`stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5)를 파인튜닝해 나만의 나루토 캐릭터를 생성해 보겠습니다.
시작하려면 `MODEL_NAME` 및 `DATASET_NAME` 환경 변수가 설정되어 있는지 확인하십시오. `OUTPUT_DIR` 및 `HUB_MODEL_ID` 변수는 선택 사항이며 허브에서 모델을 저장할 위치를 지정합니다.
```bash
export MODEL_NAME="runwayml/stable-diffusion-v1-5"
-export OUTPUT_DIR="/sddata/finetune/lora/pokemon"
-export HUB_MODEL_ID="pokemon-lora"
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export OUTPUT_DIR="/sddata/finetune/lora/naruto"
+export HUB_MODEL_ID="naruto-lora"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
```
학습을 시작하기 전에 알아야 할 몇 가지 플래그가 있습니다.
diff --git a/docs/source/ko/training/text2image.md b/docs/source/ko/training/text2image.md
index f2ad3bb0719e..8a0463b497f4 100644
--- a/docs/source/ko/training/text2image.md
+++ b/docs/source/ko/training/text2image.md
@@ -73,12 +73,12 @@ xFormers는 Flax에 사용할 수 없습니다.
-다음과 같이 [Pokémon BLIP 캡션](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) 데이터셋에서 파인튜닝 실행을 위해 [PyTorch 학습 스크립트](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py)를 실행합니다:
+다음과 같이 [Naruto BLIP 캡션](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) 데이터셋에서 파인튜닝 실행을 위해 [PyTorch 학습 스크립트](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py)를 실행합니다:
```bash
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
-export dataset_name="lambdalabs/pokemon-blip-captions"
+export dataset_name="lambdalabs/naruto-blip-captions"
accelerate launch train_text_to_image.py \
--pretrained_model_name_or_path=$MODEL_NAME \
@@ -93,7 +93,7 @@ accelerate launch train_text_to_image.py \
--learning_rate=1e-05 \
--max_grad_norm=1 \
--lr_scheduler="constant" --lr_warmup_steps=0 \
- --output_dir="sd-pokemon-model"
+ --output_dir="sd-naruto-model"
```
자체 데이터셋으로 파인튜닝하려면 🤗 [Datasets](https://huggingface.co/docs/datasets/index)에서 요구하는 형식에 따라 데이터셋을 준비하세요. [데이터셋을 허브에 업로드](https://huggingface.co/docs/datasets/image_dataset#upload-dataset-to-the-hub)하거나 [파일들이 있는 로컬 폴더를 준비](https ://huggingface.co/docs/datasets/image_dataset#imagefolder)할 수 있습니다.
@@ -136,7 +136,7 @@ pip install -U -r requirements_flax.txt
```bash
export MODEL_NAME="runwayml/stable-diffusion-v1-5"
-export dataset_name="lambdalabs/pokemon-blip-captions"
+export dataset_name="lambdalabs/naruto-blip-captions"
python train_text_to_image_flax.py \
--pretrained_model_name_or_path=$MODEL_NAME \
@@ -146,7 +146,7 @@ python train_text_to_image_flax.py \
--max_train_steps=15000 \
--learning_rate=1e-05 \
--max_grad_norm=1 \
- --output_dir="sd-pokemon-model"
+ --output_dir="sd-naruto-model"
```
자체 데이터셋으로 파인튜닝하려면 🤗 [Datasets](https://huggingface.co/docs/datasets/index)에서 요구하는 형식에 따라 데이터셋을 준비하세요. [데이터셋을 허브에 업로드](https://huggingface.co/docs/datasets/image_dataset#upload-dataset-to-the-hub)하거나 [파일들이 있는 로컬 폴더를 준비](https ://huggingface.co/docs/datasets/image_dataset#imagefolder)할 수 있습니다.
@@ -166,7 +166,7 @@ python train_text_to_image_flax.py \
--max_train_steps=15000 \
--learning_rate=1e-05 \
--max_grad_norm=1 \
- --output_dir="sd-pokemon-model"
+ --output_dir="sd-naruto-model"
```
@@ -189,7 +189,7 @@ pipe = StableDiffusionPipeline.from_pretrained(model_path, torch_dtype=torch.flo
pipe.to("cuda")
image = pipe(prompt="yoda").images[0]
-image.save("yoda-pokemon.png")
+image.save("yoda-naruto.png")
```
@@ -203,7 +203,7 @@ from diffusers import FlaxStableDiffusionPipeline
model_path = "path_to_saved_model"
pipe, params = FlaxStableDiffusionPipeline.from_pretrained(model_path, dtype=jax.numpy.bfloat16)
-prompt = "yoda pokemon"
+prompt = "yoda naruto"
prng_seed = jax.random.PRNGKey(0)
num_inference_steps = 50
@@ -218,7 +218,7 @@ prompt_ids = shard(prompt_ids)
images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).images
images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
-image.save("yoda-pokemon.png")
+image.save("yoda-naruto.png")
```
\ No newline at end of file
diff --git a/docs/source/ko/training/unconditional_training.md b/docs/source/ko/training/unconditional_training.md
index d0c200ef2daa..de9ae39a7a76 100644
--- a/docs/source/ko/training/unconditional_training.md
+++ b/docs/source/ko/training/unconditional_training.md
@@ -103,13 +103,13 @@ accelerate launch train_unconditional.py \
-[Pokemon](https://huggingface.co/datasets/huggan/pokemon) 데이터셋을 사용할 경우:
+[Naruto](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) 데이터셋을 사용할 경우:
```bash
accelerate launch train_unconditional.py \
- --dataset_name="huggan/pokemon" \
+ --dataset_name="lambdalabs/naruto-blip-captions" \
--resolution=64 \
- --output_dir="ddpm-ema-pokemon-64" \
+ --output_dir="ddpm-ema-naruto-64" \
--train_batch_size=16 \
--num_epochs=100 \
--gradient_accumulation_steps=1 \
@@ -129,9 +129,9 @@ accelerate launch train_unconditional.py \
```bash
accelerate launch --mixed_precision="fp16" --multi_gpu train_unconditional.py \
- --dataset_name="huggan/pokemon" \
+ --dataset_name="lambdalabs/naruto-blip-captions" \
--resolution=64 --center_crop --random_flip \
- --output_dir="ddpm-ema-pokemon-64" \
+ --output_dir="ddpm-ema-naruto-64" \
--train_batch_size=16 \
--num_epochs=100 \
--gradient_accumulation_steps=1 \
diff --git a/docs/source/zh/stable_diffusion.md b/docs/source/zh/stable_diffusion.md
index 614d9505d7b8..d92cdf7d1163 100644
--- a/docs/source/zh/stable_diffusion.md
+++ b/docs/source/zh/stable_diffusion.md
@@ -51,7 +51,7 @@ prompt = "portrait photo of a old warrior chief"
pipeline = pipeline.to("cuda")
```
-为了确保您可以使用相同的图像并对其进行改进,使用 [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) 方法,然后设置一个随机数种子 以确保其 [复现性](./using-diffusers/reproducibility):
+为了确保您可以使用相同的图像并对其进行改进,使用 [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) 方法,然后设置一个随机数种子 以确保其 [复现性](./using-diffusers/reusing_seeds):
```python
import torch
diff --git a/examples/advanced_diffusion_training/README.md b/examples/advanced_diffusion_training/README.md
index fda73f9ce7a5..a13ae719cfdc 100644
--- a/examples/advanced_diffusion_training/README.md
+++ b/examples/advanced_diffusion_training/README.md
@@ -234,7 +234,7 @@ In ComfyUI we will load a LoRA and a textual embedding at the same time.
SDXL's VAE is known to suffer from numerical instability issues. This is why we also expose a CLI argument namely `--pretrained_vae_model_name_or_path` that lets you specify the location of a better VAE (such as [this one](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix)).
### DoRA training
-The advanced script now supports DoRA training too!
+The advanced script supports DoRA training too!
> Proposed in [DoRA: Weight-Decomposed Low-Rank Adaptation](https://arxiv.org/abs/2402.09353),
**DoRA** is very similar to LoRA, except it decomposes the pre-trained weight into two components, **magnitude** and **direction** and employs LoRA for _directional_ updates to efficiently minimize the number of trainable parameters.
The authors found that by using DoRA, both the learning capacity and training stability of LoRA are enhanced without any additional overhead during inference.
@@ -304,6 +304,147 @@ accelerate launch train_dreambooth_lora_sdxl_advanced.py \
> [!CAUTION]
> Min-SNR gamma is not supported with the EDM-style training yet. When training with the PlaygroundAI model, it's recommended to not pass any "variant".
+### B-LoRA training
+The advanced script now supports B-LoRA training too!
+> Proposed in [Implicit Style-Content Separation using B-LoRA](https://arxiv.org/abs/2403.14572),
+B-LoRA is a method that leverages LoRA to implicitly separate the style and content components of a **single** image.
+It was shown that learning the LoRA weights of two specific blocks (referred to as B-LoRAs)
+achieves style-content separation that cannot be achieved by training each B-LoRA independently.
+Once trained, the two B-LoRAs can be used as independent components to enable various image stylization tasks.
+
+**Usage**
+Enable B-LoRA training by adding this flag:
+```bash
+--use_blora
+```
+You can train a B-LoRA with as little as 1 image and 1000 steps. Try this default configuration as a starting point:
+```bash
+accelerate launch train_dreambooth_b-lora_sdxl.py \
+ --pretrained_model_name_or_path="stabilityai/stable-diffusion-xl-base-1.0" \
+ --instance_data_dir="linoyts/B-LoRA_teddy_bear" \
+ --output_dir="B-LoRA_teddy_bear" \
+ --instance_prompt="a [v18]" \
+ --resolution=1024 \
+ --rank=64 \
+ --train_batch_size=1 \
+ --learning_rate=5e-5 \
+ --lr_scheduler="constant" \
+ --lr_warmup_steps=0 \
+ --max_train_steps=1000 \
+ --checkpointing_steps=2000 \
+ --seed="0" \
+ --gradient_checkpointing \
+ --mixed_precision="fp16"
+```
+**Inference**
+Inference is a bit different:
+1. we need to load *specific* U-net layers (as opposed to a regular LoRA/DoRA)
+2. the trained layers we load change based on our objective (e.g. style/content)
+
+```python
+import torch
+from diffusers import StableDiffusionXLPipeline, AutoencoderKL
+
+# taken & modified from B-LoRA repo - https://github.com/yardenfren1996/B-LoRA/blob/main/blora_utils.py
+def is_belong_to_blocks(key, blocks):
+ try:
+ for g in blocks:
+ if g in key:
+ return True
+ return False
+ except Exception as e:
+ raise type(e)(f'failed to is_belong_to_block, due to: {e}')
+
+def lora_lora_unet_blocks(lora_path, alpha, target_blocks):
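+ # load the LoRA state dict, keep only the entries that belong to the target blocks, and scale them by alpha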
+ state_dict, _ = pipeline.lora_state_dict(lora_path)
+ filtered_state_dict = {k: v * alpha for k, v in state_dict.items() if is_belong_to_blocks(k, target_blocks)}
+ return filtered_state_dict
+
+vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
+pipeline = StableDiffusionXLPipeline.from_pretrained(
+ "stabilityai/stable-diffusion-xl-base-1.0",
+ vae=vae,
+ torch_dtype=torch.float16,
+).to("cuda")
+
+# pick a B-LoRA for content and one for style (you can also set either to None)
+content_B_lora_path = "lora-library/B-LoRA-teddybear"
+style_B_lora_path = "lora-library/B-LoRA-pen_sketch"
+
+
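+# per the B-LoRA paper, up_blocks.0.attentions.0 captures content and up_blocks.0.attentions.1 captures style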
+content_B_LoRA = lora_lora_unet_blocks(content_B_lora_path, alpha=1, target_blocks=["unet.up_blocks.0.attentions.0"])
+style_B_LoRA = lora_lora_unet_blocks(style_B_lora_path, alpha=1.1, target_blocks=["unet.up_blocks.0.attentions.1"])
+combined_lora = {**content_B_LoRA, **style_B_LoRA}
+
+# Load both loras
+pipeline.load_lora_into_unet(combined_lora, None, pipeline.unet)
+
+# generate
+prompt = "a [v18] in [v30] style"
+pipeline(prompt, num_images_per_prompt=4).images
+```
+### LoRA training of Targeted U-net Blocks
+The advanced script now supports a custom choice of U-net blocks to train during DreamBooth LoRA tuning.
+> [!NOTE]
+> This feature is still experimental
+
+> Recently, works like B-LoRA showed the potential advantages of learning the LoRA weights of specific U-net blocks, not only in speed & memory,
+> but also in reducing the amount of data needed, improving style manipulation, and overcoming overfitting issues.
+> In light of this, we're introducing a new feature in the advanced script that makes the learned U-net blocks configurable.
+
+**Usage**
+Configure which U-net blocks learn LoRA weights by adding the `--lora_unet_blocks` flag with a comma-separated string specifying the targeted blocks, e.g.:
+```bash
+--lora_unet_blocks="unet.up_blocks.0.attentions.0,unet.up_blocks.0.attentions.1"
+```
+
+> [!NOTE]
+> If you specify both `--use_blora` and `--lora_unet_blocks`, the values given in `--lora_unet_blocks` will be ignored.
+> When `--use_blora` is enabled, the targeted U-net blocks are automatically set to `unet.up_blocks.0.attentions.0,unet.up_blocks.0.attentions.1`, as discussed in the paper.
+> If you wish to experiment with different blocks, specify `--lora_unet_blocks` only.
+
+**Inference**
+Inference is the same as for B-LoRA, except that the targeted blocks should be modified to match your training configuration.
+```python
+import torch
+from diffusers import StableDiffusionXLPipeline, AutoencoderKL
+
+# taken & modified from B-LoRA repo - https://github.com/yardenfren1996/B-LoRA/blob/main/blora_utils.py
+def is_belong_to_blocks(key, blocks):
+ try:
+ for g in blocks:
+ if g in key:
+ return True
+ return False
+ except Exception as e:
+ raise type(e)(f'failed to is_belong_to_block, due to: {e}')
+
+def lora_lora_unet_blocks(lora_path, alpha, target_blocks):
+ state_dict, _ = pipeline.lora_state_dict(lora_path)
+ filtered_state_dict = {k: v * alpha for k, v in state_dict.items() if is_belong_to_blocks(k, target_blocks)}
+ return filtered_state_dict
+
+vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
+pipeline = StableDiffusionXLPipeline.from_pretrained(
+ "stabilityai/stable-diffusion-xl-base-1.0",
+ vae=vae,
+ torch_dtype=torch.float16,
+).to("cuda")
+
+lora_path = "lora-library/B-LoRA-pen_sketch"
+
+state_dict = lora_lora_unet_blocks(lora_path, alpha=1, target_blocks=["unet.up_blocks.0.attentions.0"])
+
+# Load the trained LoRA layers into the unet
+pipeline.load_lora_into_unet(state_dict, None, pipeline.unet)
+
+# generate
+prompt = "a dog in [v30] style"
+pipeline(prompt, num_images_per_prompt=4).images
+```
+
+
### Tips and Tricks
Check out [these recommended practices](https://huggingface.co/blog/sdxl_lora_advanced_script#additional-good-practices)
diff --git a/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py b/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py
index 21a84b77245a..0699ac17077d 100644
--- a/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py
+++ b/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py
@@ -15,7 +15,6 @@
import argparse
import gc
-import hashlib
import itertools
import json
import logging
@@ -40,6 +39,7 @@
from accelerate.logging import get_logger
from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed
from huggingface_hub import create_repo, hf_hub_download, upload_folder
+from huggingface_hub.utils import insecure_hashlib
from packaging import version
from peft import LoraConfig, set_peft_model_state_dict
from peft.utils import get_peft_model_state_dict
@@ -696,6 +696,23 @@ def parse_args(input_args=None):
"Note: to use DoRA you need to install peft from main, `pip install git+https://github.com/huggingface/peft.git`"
),
)
+ parser.add_argument(
+ "--lora_unet_blocks",
+ type=str,
+ default=None,
+ help=(
+ "the U-net blocks to tune during training. please specify them in a comma separated string, e.g. `unet.up_blocks.0.attentions.0,unet.up_blocks.0.attentions.1` etc."
+ "NOTE: By default (if not specified) - regular LoRA training is performed. "
+ "if --use_blora is enabled, this arg will be ignored, since in B-LoRA training, targeted U-net blocks are `unet.up_blocks.0.attentions.0` and `unet.up_blocks.0.attentions.1`"
+ ),
+ )
+ parser.add_argument(
+ "--use_blora",
+ action="store_true",
+ help=(
+ "Whether to train a B-LoRA as proposed in- Implicit Style-Content Separation using B-LoRA https://arxiv.org/abs/2403.14572. "
+ ),
+ )
parser.add_argument(
"--cache_latents",
action="store_true",
@@ -720,6 +737,11 @@ def parse_args(input_args=None):
"For full LoRA text encoder training check --train_text_encoder, for textual "
"inversion training check `--train_text_encoder_ti`"
)
+ if args.use_blora and args.lora_unet_blocks:
+ warnings.warn(
+ "You specified both `--use_blora` and `--lora_unet_blocks`, for B-LoRA training, target unet blocks are: `unet.up_blocks.0.attentions.0` and `unet.up_blocks.0.attentions.1`. "
+ "If you wish to target different U-net blocks, don't enable `--use_blora`"
+ )
env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
if env_local_rank != -1 and env_local_rank != args.local_rank:
@@ -740,6 +762,40 @@ def parse_args(input_args=None):
return args
+# Taken (and slightly modified) from B-LoRA repo https://github.com/yardenfren1996/B-LoRA/blob/main/blora_utils.py
+def is_belong_to_blocks(key, blocks):
+ try:
+ for g in blocks:
+ if g in key:
+ return True
+ return False
+ except Exception as e:
+ raise type(e)(f"failed to is_belong_to_block, due to: {e}")
+
+
+def get_unet_lora_target_modules(unet, use_blora, target_blocks=None):
+ if use_blora:
+ content_b_lora_blocks = "unet.up_blocks.0.attentions.0"
+ style_b_lora_blocks = "unet.up_blocks.0.attentions.1"
+ target_blocks = [content_b_lora_blocks, style_b_lora_blocks]
+ try:
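+ # strip the leading "unet." prefix so the block names match the keys in unet.attn_processors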
+ blocks = [(".").join(blk.split(".")[1:]) for blk in target_blocks]
+
+ attns = [
+ attn_processor_name.rsplit(".", 1)[0]
+ for attn_processor_name, _ in unet.attn_processors.items()
+ if is_belong_to_blocks(attn_processor_name, blocks)
+ ]
+
+ target_modules = [f"{attn}.{mat}" for mat in ["to_k", "to_q", "to_v", "to_out.0"] for attn in attns]
+ return target_modules
+ except Exception as e:
+ raise type(e)(
+ f"failed to get_target_modules, due to: {e}. "
+ f"Please check the modules specified in --lora_unet_blocks are correct"
+ )
+
+
# Taken from https://github.com/replicate/cog-sdxl/blob/main/dataset_and_utils.py
class TokenEmbeddingsHandler:
def __init__(self, text_encoders, tokenizers):
@@ -946,16 +1002,20 @@ def __init__(
transforms.Normalize([0.5], [0.5]),
]
)
+ # if using B-LoRA with a single image, do not apply transformations
+ single_image = len(self.instance_images) < 2
for image in self.instance_images:
- image = exif_transpose(image)
+ if not single_image:
+ image = exif_transpose(image)
if not image.mode == "RGB":
image = image.convert("RGB")
self.original_sizes.append((image.height, image.width))
image = train_resize(image)
- if args.random_flip and random.random() < 0.5:
+
+ if not single_image and args.random_flip and random.random() < 0.5:
# flip
image = train_flip(image)
- if args.center_crop:
+ if args.center_crop or single_image:
y1 = max(0, int(round((image.height - args.resolution) / 2.0)))
x1 = max(0, int(round((image.width - args.resolution) / 2.0)))
image = train_crop(image)
@@ -1216,7 +1276,7 @@ def main(args):
images = pipeline(example["prompt"]).images
for i, image in enumerate(images):
- hash_image = hashlib.sha1(image.tobytes()).hexdigest()
+ hash_image = insecure_hashlib.sha1(image.tobytes()).hexdigest()
image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
image.save(image_filename)
@@ -1374,12 +1434,24 @@ def main(args):
text_encoder_two.gradient_checkpointing_enable()
# now we will add new LoRA weights to the attention layers
+
+ if args.use_blora:
+ # if using B-LoRA, the targeted blocks to train are automatically set
+ target_modules = get_unet_lora_target_modules(unet, use_blora=True)
+ elif args.lora_unet_blocks:
+ # if training specific unet blocks not in the B-LoRA scheme
+ target_blocks_list = "".join(args.lora_unet_blocks.split()).split(",")
+ logger.info(f"list of unet blocks to train: {target_blocks_list}")
+ target_modules = get_unet_lora_target_modules(unet, use_blora=False, target_blocks=target_blocks_list)
+ else:
+ target_modules = ["to_k", "to_q", "to_v", "to_out.0"]
+
unet_lora_config = LoraConfig(
r=args.rank,
- lora_alpha=args.rank,
use_dora=args.use_dora,
+ lora_alpha=args.rank,
init_lora_weights="gaussian",
- target_modules=["to_k", "to_q", "to_v", "to_out.0"],
+ target_modules=target_modules,
)
unet.add_adapter(unet_lora_config)
@@ -1388,8 +1460,8 @@ def main(args):
if args.train_text_encoder:
text_lora_config = LoraConfig(
r=args.rank,
- lora_alpha=args.rank,
use_dora=args.use_dora,
+ lora_alpha=args.rank,
init_lora_weights="gaussian",
target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
)
@@ -1505,6 +1577,7 @@ def load_model_hook(models, input_dir):
models = [unet_]
if args.train_text_encoder:
models.extend([text_encoder_one_, text_encoder_two_])
+ # only upcast trainable parameters (LoRA) into fp32
cast_training_params(models)
accelerator.register_save_state_pre_hook(save_model_hook)
@@ -1525,6 +1598,8 @@ def load_model_hook(models, input_dir):
models = [unet]
if args.train_text_encoder:
models.extend([text_encoder_one, text_encoder_two])
+
+ # only upcast trainable parameters (LoRA) into fp32
cast_training_params(models, dtype=torch.float32)
unet_lora_parameters = list(filter(lambda p: p.requires_grad, unet.parameters()))
@@ -1780,7 +1855,12 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers):
# We need to initialize the trackers we use, and also store our configuration.
# The trackers initializes automatically on the main process.
if accelerator.is_main_process:
- accelerator.init_trackers("dreambooth-lora-sd-xl", config=vars(args))
+ tracker_name = (
+ "dreambooth-lora-sd-xl"
+ if "playground" not in args.pretrained_model_name_or_path
+ else "dreambooth-lora-playground"
+ )
+ accelerator.init_trackers(tracker_name, config=vars(args))
# Train!
total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
@@ -1833,7 +1913,6 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers):
)
def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
- # TODO: revisit other sampling algorithms
sigmas = noise_scheduler.sigmas.to(device=accelerator.device, dtype=dtype)
schedule_timesteps = noise_scheduler.timesteps.to(accelerator.device)
timesteps = timesteps.to(accelerator.device)
@@ -1852,6 +1931,7 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
# flag used for textual inversion
pivoted = False
for epoch in range(first_epoch, args.num_train_epochs):
+ unet.train()
# if performing any kind of optimization of text_encoder params
if args.train_text_encoder or args.train_text_encoder_ti:
if epoch == num_train_epochs_text_encoder:
@@ -1869,7 +1949,6 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
text_encoder_one.text_model.embeddings.requires_grad_(True)
text_encoder_two.text_model.embeddings.requires_grad_(True)
- unet.train()
for step, batch in enumerate(train_dataloader):
if pivoted:
# stopping optimization of text_encoder params
@@ -1970,7 +2049,8 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
timesteps,
prompt_embeds_input,
added_cond_kwargs=unet_added_conditions,
- ).sample
+ return_dict=False,
+ )[0]
else:
unet_added_conditions = {"time_ids": add_time_ids}
prompt_embeds, pooled_prompt_embeds = encode_prompt(
@@ -1988,7 +2068,8 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
timesteps,
prompt_embeds_input,
added_cond_kwargs=unet_added_conditions,
- ).sample
+ return_dict=False,
+ )[0]
weighting = None
if args.do_edm_style_training:
diff --git a/examples/community/checkpoint_merger.py b/examples/community/checkpoint_merger.py
index 9df5943a86b1..f702bf0cea9b 100644
--- a/examples/community/checkpoint_merger.py
+++ b/examples/community/checkpoint_merger.py
@@ -138,7 +138,6 @@ def merge(self, pretrained_model_name_or_path_list: List[Union[str, os.PathLike]
comparison_result &= self._compare_model_configs(config_dicts[idx - 1], config_dicts[idx])
if not force and comparison_result is False:
raise ValueError("Incompatible checkpoints. Please check model_index.json for the models.")
- print(config_dicts[0], config_dicts[1])
print("Compatible model_index.json files found")
# Step 2: Basic Validation has succeeded. Let's download the models and save them into our local files.
cached_folders = []
diff --git a/examples/community/clip_guided_stable_diffusion_img2img.py b/examples/community/clip_guided_stable_diffusion_img2img.py
index 434d5253679a..c8e0a9094f22 100644
--- a/examples/community/clip_guided_stable_diffusion_img2img.py
+++ b/examples/community/clip_guided_stable_diffusion_img2img.py
@@ -359,9 +359,16 @@ def __call__(
# Preprocess image
image = preprocess(image, width, height)
- latents = self.prepare_latents(
- image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, self.device, generator
- )
+ if latents is None:
+ latents = self.prepare_latents(
+ image,
+ latent_timestep,
+ batch_size,
+ num_images_per_prompt,
+ text_embeddings.dtype,
+ self.device,
+ generator,
+ )
if clip_guidance_scale > 0:
if clip_prompt is not None:
diff --git a/examples/community/latent_consistency_img2img.py b/examples/community/latent_consistency_img2img.py
index 35cd74166c68..3c5ffa845699 100644
--- a/examples/community/latent_consistency_img2img.py
+++ b/examples/community/latent_consistency_img2img.py
@@ -240,14 +240,6 @@ def prepare_latents(
return latents
- if latents is None:
- latents = torch.randn(shape, dtype=dtype).to(device)
- else:
- latents = latents.to(device)
- # scale the initial noise by the standard deviation required by the scheduler
- latents = latents * self.scheduler.init_noise_sigma
- return latents
-
def get_w_embedding(self, w, embedding_dim=512, dtype=torch.float32):
"""
see https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
@@ -335,17 +327,18 @@ def __call__(
# 5. Prepare latent variable
num_channels_latents = self.unet.config.in_channels
- latents = self.prepare_latents(
- image,
- latent_timestep,
- batch_size * num_images_per_prompt,
- num_channels_latents,
- height,
- width,
- prompt_embeds.dtype,
- device,
- latents,
- )
+ if latents is None:
+ latents = self.prepare_latents(
+ image,
+ latent_timestep,
+ batch_size * num_images_per_prompt,
+ num_channels_latents,
+ height,
+ width,
+ prompt_embeds.dtype,
+ device,
+ latents,
+ )
bs = batch_size * num_images_per_prompt
# 6. Get Guidance Scale Embedding
diff --git a/examples/community/pipeline_demofusion_sdxl.py b/examples/community/pipeline_demofusion_sdxl.py
index 93e1463638f0..f46d635dae2b 100644
--- a/examples/community/pipeline_demofusion_sdxl.py
+++ b/examples/community/pipeline_demofusion_sdxl.py
@@ -1304,7 +1304,11 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di
if isinstance(component, torch.nn.Module):
if hasattr(component, "_hf_hook"):
is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload)
- is_sequential_cpu_offload = isinstance(getattr(component, "_hf_hook"), AlignDevicesHook)
+ is_sequential_cpu_offload = (
+ isinstance(getattr(component, "_hf_hook"), AlignDevicesHook)
+ or hasattr(component._hf_hook, "hooks")
+ and isinstance(component._hf_hook.hooks[0], AlignDevicesHook)
+ )
logger.info(
"Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again."
)
diff --git a/examples/community/pipeline_stable_diffusion_pag.py b/examples/community/pipeline_stable_diffusion_pag.py
index 04f38a888460..cdb7bd99cb29 100644
--- a/examples/community/pipeline_stable_diffusion_pag.py
+++ b/examples/community/pipeline_stable_diffusion_pag.py
@@ -1,4 +1,5 @@
-# Implementation of StableDiffusionPAGPipeline
+# Implementation of StableDiffusionPipeline with PAG
+# https://ku-cvlab.github.io/Perturbed-Attention-Guidance
import inspect
from typing import Any, Callable, Dict, List, Optional, Union
@@ -134,8 +135,8 @@ def __call__(
value = attn.to_v(hidden_states_ptb)
- hidden_states_ptb = torch.zeros(value.shape).to(value.get_device())
- # hidden_states_ptb = value
+ # hidden_states_ptb = torch.zeros(value.shape).to(value.get_device())
+ hidden_states_ptb = value
hidden_states_ptb = hidden_states_ptb.to(query.dtype)
@@ -1045,7 +1046,7 @@ def pag_scale(self):
return self._pag_scale
@property
- def do_adversarial_guidance(self):
+ def do_perturbed_attention_guidance(self):
return self._pag_scale > 0
@property
@@ -1056,14 +1057,6 @@ def pag_adaptive_scaling(self):
def do_pag_adaptive_scaling(self):
return self._pag_adaptive_scaling > 0
- @property
- def pag_drop_rate(self):
- return self._pag_drop_rate
-
- @property
- def pag_applied_layers(self):
- return self._pag_applied_layers
-
@property
def pag_applied_layers_index(self):
return self._pag_applied_layers_index
@@ -1080,8 +1073,6 @@ def __call__(
guidance_scale: float = 7.5,
pag_scale: float = 0.0,
pag_adaptive_scaling: float = 0.0,
- pag_drop_rate: float = 0.5,
- pag_applied_layers: List[str] = ["down"], # ['down', 'mid', 'up']
pag_applied_layers_index: List[str] = ["d4"], # ['d4', 'd5', 'm0']
negative_prompt: Optional[Union[str, List[str]]] = None,
num_images_per_prompt: Optional[int] = 1,
@@ -1221,8 +1212,6 @@ def __call__(
self._pag_scale = pag_scale
self._pag_adaptive_scaling = pag_adaptive_scaling
- self._pag_drop_rate = pag_drop_rate
- self._pag_applied_layers = pag_applied_layers
self._pag_applied_layers_index = pag_applied_layers_index
# 2. Define call parameters
@@ -1257,13 +1246,13 @@ def __call__(
# to avoid doing two forward passes
# cfg
- if self.do_classifier_free_guidance and not self.do_adversarial_guidance:
+ if self.do_classifier_free_guidance and not self.do_perturbed_attention_guidance:
prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
# pag
- elif not self.do_classifier_free_guidance and self.do_adversarial_guidance:
+ elif not self.do_classifier_free_guidance and self.do_perturbed_attention_guidance:
prompt_embeds = torch.cat([prompt_embeds, prompt_embeds])
# both
- elif self.do_classifier_free_guidance and self.do_adversarial_guidance:
+ elif self.do_classifier_free_guidance and self.do_perturbed_attention_guidance:
prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds, prompt_embeds])
if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
@@ -1306,7 +1295,7 @@ def __call__(
).to(device=device, dtype=latents.dtype)
# 7. Denoising loop
- if self.do_adversarial_guidance:
+ if self.do_perturbed_attention_guidance:
down_layers = []
mid_layers = []
up_layers = []
@@ -1322,6 +1311,29 @@ def __call__(
else:
raise ValueError(f"Invalid layer type: {layer_type}")
+ # change attention layer in UNet if use PAG
+ if self.do_perturbed_attention_guidance:
+ if self.do_classifier_free_guidance:
+ replace_processor = PAGCFGIdentitySelfAttnProcessor()
+ else:
+ replace_processor = PAGIdentitySelfAttnProcessor()
+
+ drop_layers = self.pag_applied_layers_index
+ for drop_layer in drop_layers:
+ try:
+ if drop_layer[0] == "d":
+ down_layers[int(drop_layer[1])].processor = replace_processor
+ elif drop_layer[0] == "m":
+ mid_layers[int(drop_layer[1])].processor = replace_processor
+ elif drop_layer[0] == "u":
+ up_layers[int(drop_layer[1])].processor = replace_processor
+ else:
+ raise ValueError(f"Invalid layer type: {drop_layer[0]}")
+ except IndexError:
+ raise ValueError(
+ f"Invalid layer index: {drop_layer}. Available layers: {len(down_layers)} down layers, {len(mid_layers)} mid layers, {len(up_layers)} up layers."
+ )
+
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
self._num_timesteps = len(timesteps)
with self.progress_bar(total=num_inference_steps) as progress_bar:
@@ -1330,41 +1342,18 @@ def __call__(
continue
# cfg
- if self.do_classifier_free_guidance and not self.do_adversarial_guidance:
+ if self.do_classifier_free_guidance and not self.do_perturbed_attention_guidance:
latent_model_input = torch.cat([latents] * 2)
# pag
- elif not self.do_classifier_free_guidance and self.do_adversarial_guidance:
+ elif not self.do_classifier_free_guidance and self.do_perturbed_attention_guidance:
latent_model_input = torch.cat([latents] * 2)
# both
- elif self.do_classifier_free_guidance and self.do_adversarial_guidance:
+ elif self.do_classifier_free_guidance and self.do_perturbed_attention_guidance:
latent_model_input = torch.cat([latents] * 3)
# no
else:
latent_model_input = latents
- # change attention layer in UNet if use PAG
- if self.do_adversarial_guidance:
- if self.do_classifier_free_guidance:
- replace_processor = PAGCFGIdentitySelfAttnProcessor()
- else:
- replace_processor = PAGIdentitySelfAttnProcessor()
-
- drop_layers = self.pag_applied_layers_index
- for drop_layer in drop_layers:
- try:
- if drop_layer[0] == "d":
- down_layers[int(drop_layer[1])].processor = replace_processor
- elif drop_layer[0] == "m":
- mid_layers[int(drop_layer[1])].processor = replace_processor
- elif drop_layer[0] == "u":
- up_layers[int(drop_layer[1])].processor = replace_processor
- else:
- raise ValueError(f"Invalid layer type: {drop_layer[0]}")
- except IndexError:
- raise ValueError(
- f"Invalid layer index: {drop_layer}. Available layers: {len(down_layers)} down layers, {len(mid_layers)} mid layers, {len(up_layers)} up layers."
- )
-
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
# predict the noise residual
@@ -1381,14 +1370,14 @@ def __call__(
# perform guidance
# cfg
- if self.do_classifier_free_guidance and not self.do_adversarial_guidance:
+ if self.do_classifier_free_guidance and not self.do_perturbed_attention_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
delta = noise_pred_text - noise_pred_uncond
noise_pred = noise_pred_uncond + self.guidance_scale * delta
# pag
- elif not self.do_classifier_free_guidance and self.do_adversarial_guidance:
+ elif not self.do_classifier_free_guidance and self.do_perturbed_attention_guidance:
noise_pred_original, noise_pred_perturb = noise_pred.chunk(2)
signal_scale = self.pag_scale
@@ -1400,7 +1389,7 @@ def __call__(
noise_pred = noise_pred_original + signal_scale * (noise_pred_original - noise_pred_perturb)
# both
- elif self.do_classifier_free_guidance and self.do_adversarial_guidance:
+ elif self.do_classifier_free_guidance and self.do_perturbed_attention_guidance:
noise_pred_uncond, noise_pred_text, noise_pred_text_perturb = noise_pred.chunk(3)
signal_scale = self.pag_scale
@@ -1458,11 +1447,8 @@ def __call__(
# Offload all models
self.maybe_free_model_hooks()
- if not return_dict:
- return (image, has_nsfw_concept)
-
# change attention layer in UNet if use PAG
- if self.do_adversarial_guidance:
+ if self.do_perturbed_attention_guidance:
drop_layers = self.pag_applied_layers_index
for drop_layer in drop_layers:
try:
@@ -1479,4 +1465,7 @@ def __call__(
f"Invalid layer index: {drop_layer}. Available layers: {len(down_layers)} down layers, {len(mid_layers)} mid layers, {len(up_layers)} up layers."
)
+ if not return_dict:
+ return (image, has_nsfw_concept)
+
return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/examples/community/stable_diffusion_controlnet_img2img.py b/examples/community/stable_diffusion_controlnet_img2img.py
index 5f9083616a84..74674e65f0ef 100644
--- a/examples/community/stable_diffusion_controlnet_img2img.py
+++ b/examples/community/stable_diffusion_controlnet_img2img.py
@@ -802,15 +802,16 @@ def __call__(
latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
# 6. Prepare latent variables
- latents = self.prepare_latents(
- image,
- latent_timestep,
- batch_size,
- num_images_per_prompt,
- prompt_embeds.dtype,
- device,
- generator,
- )
+ if latents is None:
+ latents = self.prepare_latents(
+ image,
+ latent_timestep,
+ batch_size,
+ num_images_per_prompt,
+ prompt_embeds.dtype,
+ device,
+ generator,
+ )
# 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
diff --git a/examples/community/stable_diffusion_controlnet_inpaint_img2img.py b/examples/community/stable_diffusion_controlnet_inpaint_img2img.py
index d056eb112165..14c4e4aa6d4e 100644
--- a/examples/community/stable_diffusion_controlnet_inpaint_img2img.py
+++ b/examples/community/stable_diffusion_controlnet_inpaint_img2img.py
@@ -907,15 +907,16 @@ def __call__(
latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
# 6. Prepare latent variables
- latents = self.prepare_latents(
- image,
- latent_timestep,
- batch_size,
- num_images_per_prompt,
- prompt_embeds.dtype,
- device,
- generator,
- )
+ if latents is None:
+ latents = self.prepare_latents(
+ image,
+ latent_timestep,
+ batch_size,
+ num_images_per_prompt,
+ prompt_embeds.dtype,
+ device,
+ generator,
+ )
mask_image_latents = self.prepare_mask_latents(
mask_image,
diff --git a/examples/consistency_distillation/README_sdxl.md b/examples/consistency_distillation/README_sdxl.md
index d3abaa4ce175..6bd84727cf31 100644
--- a/examples/consistency_distillation/README_sdxl.md
+++ b/examples/consistency_distillation/README_sdxl.md
@@ -115,11 +115,11 @@ accelerate launch train_lcm_distill_lora_sdxl_wds.py \
We provide another version for LCM LoRA SDXL that follows best practices of `peft` and leverages the `datasets` library for quick experimentation. The script doesn't load two UNets unlike `train_lcm_distill_lora_sdxl_wds.py` which reduces the memory requirements quite a bit.
-Below is an example training command that trains an LCM LoRA on the [Pokemons dataset](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions):
+Below is an example training command that trains an LCM LoRA on the [Naruto dataset](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions):
```bash
export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0"
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
export VAE_PATH="madebyollin/sdxl-vae-fp16-fix"
accelerate launch train_lcm_distill_lora_sdxl.py \
diff --git a/examples/consistency_distillation/train_lcm_distill_lora_sdxl.py b/examples/consistency_distillation/train_lcm_distill_lora_sdxl.py
index 9405c238f937..56f83f47b84c 100644
--- a/examples/consistency_distillation/train_lcm_distill_lora_sdxl.py
+++ b/examples/consistency_distillation/train_lcm_distill_lora_sdxl.py
@@ -71,7 +71,7 @@
logger = get_logger(__name__)
DATASET_NAME_MAPPING = {
- "lambdalabs/pokemon-blip-captions": ("image", "text"),
+ "lambdalabs/naruto-blip-captions": ("image", "text"),
}
diff --git a/examples/controlnet/train_controlnet_sdxl.py b/examples/controlnet/train_controlnet_sdxl.py
index 62192521a323..288a1e3fb612 100644
--- a/examples/controlnet/train_controlnet_sdxl.py
+++ b/examples/controlnet/train_controlnet_sdxl.py
@@ -32,7 +32,7 @@
import transformers
from accelerate import Accelerator
from accelerate.logging import get_logger
-from accelerate.utils import ProjectConfiguration, set_seed
+from accelerate.utils import DistributedType, ProjectConfiguration, set_seed
from datasets import load_dataset
from huggingface_hub import create_repo, upload_folder
from packaging import version
@@ -53,7 +53,7 @@
from diffusers.optimization import get_scheduler
from diffusers.utils import check_min_version, is_wandb_available, make_image_grid
from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
-from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.import_utils import is_torch_npu_available, is_xformers_available
from diffusers.utils.torch_utils import is_compiled_module
@@ -64,6 +64,8 @@
check_min_version("0.28.0.dev0")
logger = get_logger(__name__)
+if is_torch_npu_available():
+ torch.npu.config.allow_internal_format = False
def log_validation(vae, unet, controlnet, args, accelerator, weight_dtype, step, is_final_validation=False):
@@ -471,6 +473,9 @@ def parse_args(input_args=None):
parser.add_argument(
"--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
)
+ parser.add_argument(
+ "--enable_npu_flash_attention", action="store_true", help="Whether or not to use npu flash attention."
+ )
parser.add_argument(
"--set_grads_to_none",
action="store_true",
@@ -936,6 +941,13 @@ def load_model_hook(models, input_dir):
text_encoder_two.requires_grad_(False)
controlnet.train()
+ if args.enable_npu_flash_attention:
+ if is_torch_npu_available():
+ logger.info("npu flash attention enabled.")
+ unet.enable_npu_flash_attention()
+ else:
+ raise ValueError("npu flash attention requires torch_npu extensions and is supported only on npu devices.")
+
if args.enable_xformers_memory_efficient_attention:
if is_xformers_available():
import xformers
@@ -1235,7 +1247,8 @@ def compute_embeddings(batch, proportion_empty_prompts, text_encoders, tokenizer
progress_bar.update(1)
global_step += 1
- if accelerator.is_main_process:
+ # DeepSpeed requires saving weights on every device; saving weights only on the main process would cause issues.
+ if accelerator.distributed_type == DistributedType.DEEPSPEED or accelerator.is_main_process:
if global_step % args.checkpointing_steps == 0:
# _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
if args.checkpoints_total_limit is not None:
diff --git a/examples/kandinsky2_2/text_to_image/README.md b/examples/kandinsky2_2/text_to_image/README.md
index 6e5a1835593f..d27ba1a21f0c 100644
--- a/examples/kandinsky2_2/text_to_image/README.md
+++ b/examples/kandinsky2_2/text_to_image/README.md
@@ -57,7 +57,7 @@ To disable wandb logging, remove the `--report_to=="wandb"` and `--validation_pr
```bash
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
accelerate launch --mixed_precision="fp16" train_text_to_image_decoder.py \
--dataset_name=$DATASET_NAME \
@@ -139,7 +139,7 @@ You can fine-tune the Kandinsky prior model with `train_text_to_image_prior.py`
```bash
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
accelerate launch --mixed_precision="fp16" train_text_to_image_prior.py \
--dataset_name=$DATASET_NAME \
@@ -183,7 +183,7 @@ If you want to use a fine-tuned decoder checkpoint along with your fine-tuned pr
for running distributed training with `accelerate`. Here is an example command:
```bash
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
accelerate launch --mixed_precision="fp16" --multi_gpu train_text_to_image_decoder.py \
--dataset_name=$DATASET_NAME \
@@ -227,13 +227,13 @@ on consumer GPUs like Tesla T4, Tesla V100.
### Training
-First, you need to set up your development environment as explained in the [installation](#installing-the-dependencies). Make sure to set the `MODEL_NAME` and `DATASET_NAME` environment variables. Here, we will use [Kandinsky 2.2](https://huggingface.co/kandinsky-community/kandinsky-2-2-decoder) and the [Pokemons dataset](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions).
+First, you need to set up your development environment as explained in the [installation](#installing-the-dependencies). Make sure to set the `MODEL_NAME` and `DATASET_NAME` environment variables. Here, we will use [Kandinsky 2.2](https://huggingface.co/kandinsky-community/kandinsky-2-2-decoder) and the [Naruto dataset](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions).
#### Train decoder
```bash
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
accelerate launch --mixed_precision="fp16" train_text_to_image_decoder_lora.py \
--dataset_name=$DATASET_NAME --caption_column="text" \
@@ -252,7 +252,7 @@ accelerate launch --mixed_precision="fp16" train_text_to_image_decoder_lora.py \
#### Train prior
```bash
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
accelerate launch --mixed_precision="fp16" train_text_to_image_prior_lora.py \
--dataset_name=$DATASET_NAME --caption_column="text" \
diff --git a/examples/kandinsky2_2/text_to_image/train_text_to_image_lora_prior.py b/examples/kandinsky2_2/text_to_image/train_text_to_image_lora_prior.py
index e169cf92beb9..f6f3896aaa12 100644
--- a/examples/kandinsky2_2/text_to_image/train_text_to_image_lora_prior.py
+++ b/examples/kandinsky2_2/text_to_image/train_text_to_image_lora_prior.py
@@ -332,7 +332,7 @@ def parse_args():
DATASET_NAME_MAPPING = {
- "lambdalabs/pokemon-blip-captions": ("image", "text"),
+ "lambdalabs/naruto-blip-captions": ("image", "text"),
}
diff --git a/examples/kandinsky2_2/text_to_image/train_text_to_image_prior.py b/examples/kandinsky2_2/text_to_image/train_text_to_image_prior.py
index bd95aed2939c..54a4d0a397b4 100644
--- a/examples/kandinsky2_2/text_to_image/train_text_to_image_prior.py
+++ b/examples/kandinsky2_2/text_to_image/train_text_to_image_prior.py
@@ -56,7 +56,7 @@
logger = get_logger(__name__, log_level="INFO")
DATASET_NAME_MAPPING = {
- "lambdalabs/pokemon-blip-captions": ("image", "text"),
+ "lambdalabs/naruto-blip-captions": ("image", "text"),
}
diff --git a/examples/research_projects/lora/README.md b/examples/research_projects/lora/README.md
index b5d72403166f..14cd6cd9be56 100644
--- a/examples/research_projects/lora/README.md
+++ b/examples/research_projects/lora/README.md
@@ -19,7 +19,7 @@ on consumer GPUs like Tesla T4, Tesla V100.
### Training
-First, you need to set up your development environment as is explained in the [installation section](#installing-the-dependencies). Make sure to set the `MODEL_NAME` and `DATASET_NAME` environment variables. Here, we will use [Stable Diffusion v1-4](https://hf.co/CompVis/stable-diffusion-v1-4) and the [Pokemons dataset](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions).
+First, you need to set up your development environment as is explained in the [installation section](#installing-the-dependencies). Make sure to set the `MODEL_NAME` and `DATASET_NAME` environment variables. Here, we will use [Stable Diffusion v1-4](https://hf.co/CompVis/stable-diffusion-v1-4) and the [Naruto dataset](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions).
**___Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.___**
@@ -27,7 +27,7 @@ First, you need to set up your development environment as is explained in the [i
```bash
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
```
For this example we want to directly store the trained LoRA embeddings on the Hub, so
diff --git a/examples/research_projects/lora/train_text_to_image_lora.py b/examples/research_projects/lora/train_text_to_image_lora.py
index cf00bf270057..1ebc1422b064 100644
--- a/examples/research_projects/lora/train_text_to_image_lora.py
+++ b/examples/research_projects/lora/train_text_to_image_lora.py
@@ -387,7 +387,7 @@ def parse_args():
DATASET_NAME_MAPPING = {
- "lambdalabs/pokemon-blip-captions": ("image", "text"),
+ "lambdalabs/naruto-blip-captions": ("image", "text"),
}
diff --git a/examples/research_projects/onnxruntime/text_to_image/README.md b/examples/research_projects/onnxruntime/text_to_image/README.md
index 48bce2065444..8b499795746c 100644
--- a/examples/research_projects/onnxruntime/text_to_image/README.md
+++ b/examples/research_projects/onnxruntime/text_to_image/README.md
@@ -55,7 +55,7 @@ The command to train a DDPM UNetCondition model on the Pokemon dataset with onnx
```bash
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
-export dataset_name="lambdalabs/pokemon-blip-captions"
+export dataset_name="lambdalabs/naruto-blip-captions"
accelerate launch --mixed_precision="fp16" train_text_to_image.py \
--pretrained_model_name_or_path=$MODEL_NAME \
--dataset_name=$dataset_name \
diff --git a/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py b/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py
index ee61f033d34d..126a10b4f9e9 100644
--- a/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py
+++ b/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py
@@ -59,7 +59,7 @@
logger = get_logger(__name__, log_level="INFO")
DATASET_NAME_MAPPING = {
- "lambdalabs/pokemon-blip-captions": ("image", "text"),
+ "lambdalabs/naruto-blip-captions": ("image", "text"),
}
diff --git a/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image.py b/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image.py
index 0f4cc6c50b5e..d3bf95305dad 100644
--- a/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image.py
+++ b/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image.py
@@ -61,7 +61,7 @@
logger = get_logger(__name__, log_level="INFO")
DATASET_NAME_MAPPING = {
- "lambdalabs/pokemon-blip-captions": ("image", "text"),
+ "lambdalabs/naruto-blip-captions": ("image", "text"),
}
diff --git a/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_lora.py b/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_lora.py
index f22519b02e2b..a4b4d69bb892 100644
--- a/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_lora.py
+++ b/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_lora.py
@@ -406,7 +406,7 @@ def parse_args():
DATASET_NAME_MAPPING = {
- "lambdalabs/pokemon-blip-captions": ("image", "text"),
+ "lambdalabs/naruto-blip-captions": ("image", "text"),
}
diff --git a/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_lora_sdxl.py b/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_lora_sdxl.py
index e5ff9d39e8ba..d7f2dcaa3442 100644
--- a/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_lora_sdxl.py
+++ b/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_lora_sdxl.py
@@ -468,7 +468,7 @@ def parse_args(input_args=None):
DATASET_NAME_MAPPING = {
- "lambdalabs/pokemon-blip-captions": ("image", "text"),
+ "lambdalabs/naruto-blip-captions": ("image", "text"),
}
diff --git a/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_sdxl.py b/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_sdxl.py
index 1dac573fce4c..a056bcfc8cb1 100644
--- a/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_sdxl.py
+++ b/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_sdxl.py
@@ -60,7 +60,7 @@
DATASET_NAME_MAPPING = {
- "lambdalabs/pokemon-blip-captions": ("image", "text"),
+ "lambdalabs/naruto-blip-captions": ("image", "text"),
}
diff --git a/examples/text_to_image/README.md b/examples/text_to_image/README.md
index f2931d3f347e..9a8410604878 100644
--- a/examples/text_to_image/README.md
+++ b/examples/text_to_image/README.md
@@ -57,7 +57,7 @@ With `gradient_checkpointing` and `mixed_precision` it should be possible to fin
```bash
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
accelerate launch --mixed_precision="fp16" train_text_to_image.py \
--pretrained_model_name_or_path=$MODEL_NAME \
@@ -136,7 +136,7 @@ for running distributed training with `accelerate`. Here is an example command:
```bash
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
accelerate launch --mixed_precision="fp16" --multi_gpu train_text_to_image.py \
--pretrained_model_name_or_path=$MODEL_NAME \
@@ -170,6 +170,11 @@ For our small Pokemons dataset, the effects of Min-SNR weighting strategy might
Also, note that in this example, we either predict `epsilon` (i.e., the noise) or the `v_prediction`. For both of these cases, the formulation of the Min-SNR weighting strategy that we have used holds.
+#### Training with DREAM
+
+We support training epsilon (noise) prediction models using the [DREAM (Diffusion Rectification and Estimation-Adaptive Models) strategy](https://arxiv.org/abs/2312.00210). DREAM claims to increase model fidelity at the cost of one extra gradient-free UNet `forward` pass per training step. You can turn on DREAM training with the `--dream_training` flag. The `--dream_detail_preservation` argument controls the detail preservation variable *p*; it defaults to 1.0, the value suggested in the paper.
+
+
## Training with LoRA
Low-Rank Adaption of Large Language Models was first introduced by Microsoft in [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685) by *Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen*.
@@ -187,7 +192,7 @@ on consumer GPUs like Tesla T4, Tesla V100.
### Training
-First, you need to set up your development environment as is explained in the [installation section](#installing-the-dependencies). Make sure to set the `MODEL_NAME` and `DATASET_NAME` environment variables. Here, we will use [Stable Diffusion v1-4](https://hf.co/CompVis/stable-diffusion-v1-4) and the [Pokemons dataset](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions).
+First, you need to set up your development environment as is explained in the [installation section](#installing-the-dependencies). Make sure to set the `MODEL_NAME` and `DATASET_NAME` environment variables. Here, we will use [Stable Diffusion v1-4](https://hf.co/CompVis/stable-diffusion-v1-4) and the [Naruto dataset](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions).
**___Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.___**
@@ -195,7 +200,7 @@ First, you need to set up your development environment as is explained in the [i
```bash
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
```
For this example we want to directly store the trained LoRA embeddings on the Hub, so
@@ -277,7 +282,7 @@ pip install -U -r requirements_flax.txt
```bash
export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
python train_text_to_image_flax.py \
--pretrained_model_name_or_path=$MODEL_NAME \
diff --git a/examples/text_to_image/README_sdxl.md b/examples/text_to_image/README_sdxl.md
index 349feef5008e..35ea0091c4f3 100644
--- a/examples/text_to_image/README_sdxl.md
+++ b/examples/text_to_image/README_sdxl.md
@@ -52,7 +52,7 @@ Note also that we use PEFT library as backend for LoRA training, make sure to ha
```bash
export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0"
export VAE_NAME="madebyollin/sdxl-vae-fp16-fix"
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
accelerate launch train_text_to_image_sdxl.py \
--pretrained_model_name_or_path=$MODEL_NAME \
@@ -76,7 +76,7 @@ accelerate launch train_text_to_image_sdxl.py \
**Notes**:
-* The `train_text_to_image_sdxl.py` script pre-computes text embeddings and the VAE encodings and keeps them in memory. While for smaller datasets like [`lambdalabs/pokemon-blip-captions`](https://hf.co/datasets/lambdalabs/pokemon-blip-captions), it might not be a problem, it can definitely lead to memory problems when the script is used on a larger dataset. For those purposes, you would want to serialize these pre-computed representations to disk separately and load them during the fine-tuning process. Refer to [this PR](https://github.com/huggingface/diffusers/pull/4505) for a more in-depth discussion.
+* The `train_text_to_image_sdxl.py` script pre-computes text embeddings and the VAE encodings and keeps them in memory. While for smaller datasets like [`lambdalabs/naruto-blip-captions`](https://hf.co/datasets/lambdalabs/naruto-blip-captions), it might not be a problem, it can definitely lead to memory problems when the script is used on a larger dataset. For those purposes, you would want to serialize these pre-computed representations to disk separately and load them during the fine-tuning process. Refer to [this PR](https://github.com/huggingface/diffusers/pull/4505) for a more in-depth discussion.
* The training script is compute-intensive and may not run on a consumer GPU like Tesla T4.
* The training command shown above performs intermediate quality validation in between the training epochs and logs the results to Weights and Biases. `--report_to`, `--validation_prompt`, and `--validation_epochs` are the relevant CLI arguments here.
* SDXL's VAE is known to suffer from numerical instability issues. This is why we also expose a CLI argument namely `--pretrained_vae_model_name_or_path` that lets you specify the location of a better VAE (such as [this one](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix)).
@@ -142,14 +142,14 @@ on consumer GPUs like Tesla T4, Tesla V100.
### Training
-First, you need to set up your development environment as is explained in the [installation section](#installing-the-dependencies). Make sure to set the `MODEL_NAME` and `DATASET_NAME` environment variables and, optionally, the `VAE_NAME` variable. Here, we will use [Stable Diffusion XL 1.0-base](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) and the [Pokemons dataset](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions).
+First, you need to set up your development environment as is explained in the [installation section](#installing-the-dependencies). Make sure to set the `MODEL_NAME` and `DATASET_NAME` environment variables and, optionally, the `VAE_NAME` variable. Here, we will use [Stable Diffusion XL 1.0-base](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) and the [Naruto dataset](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions).
**___Note: It is quite useful to monitor the training progress by regularly generating sample images during training. [Weights and Biases](https://docs.wandb.ai/quickstart) is a nice solution to easily see generating images during training. All you need to do is to run `pip install wandb` before training to automatically log images.___**
```bash
export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0"
export VAE_NAME="madebyollin/sdxl-vae-fp16-fix"
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
```
For this example we want to directly store the trained LoRA embeddings on the Hub, so
@@ -219,7 +219,7 @@ You need to save the mentioned configuration as an `accelerate_config.yaml` file
```shell
export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0"
export VAE_NAME="madebyollin/sdxl-vae-fp16-fix"
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
export ACCELERATE_CONFIG_FILE="your accelerate_config.yaml"
accelerate launch --config_file $ACCELERATE_CONFIG_FILE train_text_to_image_lora_sdxl.py \
diff --git a/examples/text_to_image/train_text_to_image.py b/examples/text_to_image/train_text_to_image.py
index 84f4c6514cfd..13ee0f2cc4c7 100644
--- a/examples/text_to_image/train_text_to_image.py
+++ b/examples/text_to_image/train_text_to_image.py
@@ -45,7 +45,7 @@
import diffusers
from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionPipeline, UNet2DConditionModel
from diffusers.optimization import get_scheduler
-from diffusers.training_utils import EMAModel, compute_snr
+from diffusers.training_utils import EMAModel, compute_dream_and_update_latents, compute_snr
from diffusers.utils import check_min_version, deprecate, is_wandb_available, make_image_grid
from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
from diffusers.utils.import_utils import is_xformers_available
@@ -62,7 +62,7 @@
logger = get_logger(__name__, log_level="INFO")
DATASET_NAME_MAPPING = {
- "lambdalabs/pokemon-blip-captions": ("image", "text"),
+ "lambdalabs/naruto-blip-captions": ("image", "text"),
}
@@ -361,6 +361,20 @@ def parse_args():
help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. "
"More details here: https://arxiv.org/abs/2303.09556.",
)
+ parser.add_argument(
+ "--dream_training",
+ action="store_true",
+ help=(
+            "Use the DREAM training method, which makes training more efficient and accurate at the "
+            "expense of doing an extra forward pass. See: https://arxiv.org/abs/2312.00210"
+ ),
+ )
+ parser.add_argument(
+ "--dream_detail_preservation",
+ type=float,
+ default=1.0,
+ help="Dream detail preservation factor p (should be greater than 0; default=1.0, as suggested in the paper)",
+ )
parser.add_argument(
"--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
)
@@ -948,6 +962,18 @@ def unwrap_model(model):
else:
raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+ if args.dream_training:
+ noisy_latents, target = compute_dream_and_update_latents(
+ unet,
+ noise_scheduler,
+ timesteps,
+ noise,
+ noisy_latents,
+ target,
+ encoder_hidden_states,
+ args.dream_detail_preservation,
+ )
+
# Predict the noise residual and compute loss
model_pred = unet(noisy_latents, timesteps, encoder_hidden_states, return_dict=False)[0]
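The hunk above wires the new `--dream_training` / `--dream_detail_preservation` flags into the loss computation through `diffusers.training_utils.compute_dream_and_update_latents`. As a rough orientation only, the sketch below shows the idea for epsilon-prediction as described in the DREAM paper: one extra gradient-free UNet pass, after which the noisy latents and the regression target are shifted by the detached estimation error, weighted by a detail-preservation exponent `p`. The function name `dream_rectify` and the exact weighting are assumptions; they are not the maintained diffusers implementation.

```python
# Hedged sketch of the DREAM rectification step for epsilon-prediction. NOT the diffusers
# implementation; the weighting is an assumption based on https://arxiv.org/abs/2312.00210.
import torch


def dream_rectify(unet, noisy_latents, timesteps, encoder_hidden_states, noise, target,
                  sqrt_one_minus_alphas_cumprod, p=1.0):
    # Extra forward pass with the current weights, without building a graph.
    with torch.no_grad():
        pred_noise = unet(noisy_latents, timesteps, encoder_hidden_states, return_dict=False)[0]

    # Per-timestep weight; p is the detail-preservation exponent (--dream_detail_preservation).
    sigma = sqrt_one_minus_alphas_cumprod[timesteps].view(-1, 1, 1, 1)
    lam = sigma**p

    # Shift both the input latents and the target by the detached estimation error, so the
    # trainable pass learns to correct the model's own current estimate.
    delta = (noise - pred_noise).detach() * lam
    return noisy_latents + sigma * delta, target + delta
```

In the script itself the call is gated on `args.dream_training`, and the returned pair replaces `noisy_latents` and `target` immediately before the regular prediction and loss.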
diff --git a/examples/text_to_image/train_text_to_image_flax.py b/examples/text_to_image/train_text_to_image_flax.py
index 557923c52e00..c3a08a90b4e5 100644
--- a/examples/text_to_image/train_text_to_image_flax.py
+++ b/examples/text_to_image/train_text_to_image_flax.py
@@ -250,7 +250,7 @@ def parse_args():
dataset_name_mapping = {
- "lambdalabs/pokemon-blip-captions": ("image", "text"),
+ "lambdalabs/naruto-blip-captions": ("image", "text"),
}
diff --git a/examples/text_to_image/train_text_to_image_lora.py b/examples/text_to_image/train_text_to_image_lora.py
index 7164ac909cb2..37b10cfd1bad 100644
--- a/examples/text_to_image/train_text_to_image_lora.py
+++ b/examples/text_to_image/train_text_to_image_lora.py
@@ -387,7 +387,7 @@ def parse_args():
DATASET_NAME_MAPPING = {
- "lambdalabs/pokemon-blip-captions": ("image", "text"),
+ "lambdalabs/naruto-blip-captions": ("image", "text"),
}
diff --git a/examples/text_to_image/train_text_to_image_lora_sdxl.py b/examples/text_to_image/train_text_to_image_lora_sdxl.py
index 0a6a70de2dc7..c9883252d14b 100644
--- a/examples/text_to_image/train_text_to_image_lora_sdxl.py
+++ b/examples/text_to_image/train_text_to_image_lora_sdxl.py
@@ -32,7 +32,7 @@
import transformers
from accelerate import Accelerator
from accelerate.logging import get_logger
-from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed
+from accelerate.utils import DistributedDataParallelKwargs, DistributedType, ProjectConfiguration, set_seed
from datasets import load_dataset
from huggingface_hub import create_repo, upload_folder
from packaging import version
@@ -60,7 +60,7 @@
is_wandb_available,
)
from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
-from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.import_utils import is_torch_npu_available, is_xformers_available
from diffusers.utils.torch_utils import is_compiled_module
@@ -68,6 +68,8 @@
check_min_version("0.28.0.dev0")
logger = get_logger(__name__)
+if is_torch_npu_available():
+ torch.npu.config.allow_internal_format = False
def save_model_card(
@@ -419,6 +421,9 @@ def parse_args(input_args=None):
parser.add_argument(
"--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
)
+ parser.add_argument(
+ "--enable_npu_flash_attention", action="store_true", help="Whether or not to use npu flash attention."
+ )
parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.")
parser.add_argument(
"--rank",
@@ -449,7 +454,7 @@ def parse_args(input_args=None):
DATASET_NAME_MAPPING = {
- "lambdalabs/pokemon-blip-captions": ("image", "text"),
+ "lambdalabs/naruto-blip-captions": ("image", "text"),
}
@@ -623,6 +628,13 @@ def main(args):
text_encoder_one.to(accelerator.device, dtype=weight_dtype)
text_encoder_two.to(accelerator.device, dtype=weight_dtype)
+ if args.enable_npu_flash_attention:
+ if is_torch_npu_available():
+ logger.info("npu flash attention enabled.")
+ unet.enable_npu_flash_attention()
+ else:
+ raise ValueError("npu flash attention requires torch_npu extensions and is supported only on npu devices.")
+
if args.enable_xformers_memory_efficient_attention:
if is_xformers_available():
import xformers
@@ -1149,7 +1161,8 @@ def compute_time_ids(original_size, crops_coords_top_left):
accelerator.log({"train_loss": train_loss}, step=global_step)
train_loss = 0.0
- if accelerator.is_main_process:
+ # DeepSpeed requires saving weights on every device; saving weights only on the main process would cause issues.
+ if accelerator.distributed_type == DistributedType.DEEPSPEED or accelerator.is_main_process:
if global_step % args.checkpointing_steps == 0:
# _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
if args.checkpoints_total_limit is not None:
diff --git a/examples/text_to_image/train_text_to_image_sdxl.py b/examples/text_to_image/train_text_to_image_sdxl.py
index 88adbb995531..90602ad597a9 100644
--- a/examples/text_to_image/train_text_to_image_sdxl.py
+++ b/examples/text_to_image/train_text_to_image_sdxl.py
@@ -61,7 +61,7 @@
DATASET_NAME_MAPPING = {
- "lambdalabs/pokemon-blip-captions": ("image", "text"),
+ "lambdalabs/naruto-blip-captions": ("image", "text"),
}
diff --git a/examples/wuerstchen/text_to_image/README.md b/examples/wuerstchen/text_to_image/README.md
index d655259088e4..7583296e66d1 100644
--- a/examples/wuerstchen/text_to_image/README.md
+++ b/examples/wuerstchen/text_to_image/README.md
@@ -37,7 +37,7 @@ You can fine-tune the Würstchen prior model with the `train_text_to_image_prior
```bash
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
accelerate launch train_text_to_image_prior.py \
--mixed_precision="fp16" \
@@ -72,10 +72,10 @@ In a nutshell, LoRA allows adapting pretrained models by adding pairs of rank-de
### Prior Training
-First, you need to set up your development environment as explained in the [installation](#Running-locally-with-PyTorch) section. Make sure to set the `DATASET_NAME` environment variable. Here, we will use the [Pokemon captions dataset](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions).
+First, you need to set up your development environment as explained in the [installation](#Running-locally-with-PyTorch) section. Make sure to set the `DATASET_NAME` environment variable. Here, we will use the [Naruto captions dataset](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions).
```bash
-export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export DATASET_NAME="lambdalabs/naruto-blip-captions"
accelerate launch train_text_to_image_lora_prior.py \
--mixed_precision="fp16" \
diff --git a/examples/wuerstchen/text_to_image/train_text_to_image_lora_prior.py b/examples/wuerstchen/text_to_image/train_text_to_image_lora_prior.py
index 76eaf6423960..79f7d8576ff4 100644
--- a/examples/wuerstchen/text_to_image/train_text_to_image_lora_prior.py
+++ b/examples/wuerstchen/text_to_image/train_text_to_image_lora_prior.py
@@ -55,7 +55,7 @@
logger = get_logger(__name__, log_level="INFO")
DATASET_NAME_MAPPING = {
- "lambdalabs/pokemon-blip-captions": ("image", "text"),
+ "lambdalabs/naruto-blip-captions": ("image", "text"),
}
diff --git a/examples/wuerstchen/text_to_image/train_text_to_image_prior.py b/examples/wuerstchen/text_to_image/train_text_to_image_prior.py
index 49cc5d26072d..3e0acfdaf519 100644
--- a/examples/wuerstchen/text_to_image/train_text_to_image_prior.py
+++ b/examples/wuerstchen/text_to_image/train_text_to_image_prior.py
@@ -56,7 +56,7 @@
logger = get_logger(__name__, log_level="INFO")
DATASET_NAME_MAPPING = {
- "lambdalabs/pokemon-blip-captions": ("image", "text"),
+ "lambdalabs/naruto-blip-captions": ("image", "text"),
}
diff --git a/scripts/convert_animatediff_motion_lora_to_diffusers.py b/scripts/convert_animatediff_motion_lora_to_diffusers.py
index 509a7345793c..c680fdc68462 100644
--- a/scripts/convert_animatediff_motion_lora_to_diffusers.py
+++ b/scripts/convert_animatediff_motion_lora_to_diffusers.py
@@ -1,7 +1,7 @@
import argparse
import torch
-from safetensors.torch import save_file
+from safetensors.torch import load_file, save_file
def convert_motion_module(original_state_dict):
@@ -34,7 +34,10 @@ def get_args():
if __name__ == "__main__":
args = get_args()
- state_dict = torch.load(args.ckpt_path, map_location="cpu")
+ if args.ckpt_path.endswith(".safetensors"):
+ state_dict = load_file(args.ckpt_path)
+ else:
+ state_dict = torch.load(args.ckpt_path, map_location="cpu")
if "state_dict" in state_dict.keys():
state_dict = state_dict["state_dict"]
diff --git a/scripts/convert_animatediff_motion_module_to_diffusers.py b/scripts/convert_animatediff_motion_module_to_diffusers.py
index ceb967acd3d6..e188a6a533e8 100644
--- a/scripts/convert_animatediff_motion_module_to_diffusers.py
+++ b/scripts/convert_animatediff_motion_module_to_diffusers.py
@@ -1,6 +1,7 @@
import argparse
import torch
+from safetensors.torch import load_file
from diffusers import MotionAdapter
@@ -30,6 +31,7 @@ def get_args():
parser.add_argument("--output_path", type=str, required=True)
parser.add_argument("--use_motion_mid_block", action="store_true")
parser.add_argument("--motion_max_seq_length", type=int, default=32)
+ parser.add_argument("--block_out_channels", nargs="+", default=[320, 640, 1280, 1280], type=int)
parser.add_argument("--save_fp16", action="store_true")
return parser.parse_args()
@@ -38,17 +40,23 @@ def get_args():
if __name__ == "__main__":
args = get_args()
- state_dict = torch.load(args.ckpt_path, map_location="cpu")
+ if args.ckpt_path.endswith(".safetensors"):
+ state_dict = load_file(args.ckpt_path)
+ else:
+ state_dict = torch.load(args.ckpt_path, map_location="cpu")
+
if "state_dict" in state_dict.keys():
state_dict = state_dict["state_dict"]
conv_state_dict = convert_motion_module(state_dict)
adapter = MotionAdapter(
- use_motion_mid_block=args.use_motion_mid_block, motion_max_seq_length=args.motion_max_seq_length
+ block_out_channels=args.block_out_channels,
+ use_motion_mid_block=args.use_motion_mid_block,
+ motion_max_seq_length=args.motion_max_seq_length,
)
# skip loading position embeddings
adapter.load_state_dict(conv_state_dict, strict=False)
adapter.save_pretrained(args.output_path)
if args.save_fp16:
- adapter.to(torch.float16).save_pretrained(args.output_path, variant="fp16")
+ adapter.to(dtype=torch.float16).save_pretrained(args.output_path, variant="fp16")
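Both AnimateDiff conversion scripts now pick the loader by file extension, and the motion-module script additionally exposes `--block_out_channels` so adapters for UNets with non-default widths can be converted and saved in fp16. A minimal sketch of that flow outside the CLI follows; the checkpoint path and widths are placeholders, and the key remapping done by the script's `convert_motion_module` helper is elided, so this is illustrative rather than a full converter.

```python
# Minimal sketch (placeholder paths/values): load a motion-module checkpoint by extension and
# build a MotionAdapter whose block widths match the target UNet. Key remapping via the script's
# convert_motion_module() is omitted here.
import torch
from safetensors.torch import load_file

from diffusers import MotionAdapter

ckpt_path = "motion_module.safetensors"  # placeholder

if ckpt_path.endswith(".safetensors"):
    state_dict = load_file(ckpt_path)
else:
    state_dict = torch.load(ckpt_path, map_location="cpu")

adapter = MotionAdapter(
    block_out_channels=[320, 640, 1280, 1280],  # override for UNets with other widths
    use_motion_mid_block=True,
    motion_max_seq_length=32,
)
adapter.load_state_dict(state_dict, strict=False)  # strict=False: position embeddings are skipped, as in the script
adapter.to(dtype=torch.float16).save_pretrained("motion-adapter-out", variant="fp16")
```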
diff --git a/scripts/convert_zero123_to_diffusers.py b/scripts/convert_zero123_to_diffusers.py
index 669a4962be3c..b46633fae7ff 100644
--- a/scripts/convert_zero123_to_diffusers.py
+++ b/scripts/convert_zero123_to_diffusers.py
@@ -113,7 +113,7 @@ def create_unet_diffusers_config(original_config, image_size: int, controlnet=Fa
assert "adm_in_channels" in unet_params
projection_class_embeddings_input_dim = unet_params["adm_in_channels"]
else:
- raise NotImplementedError(f"Unknown conditional unet num_classes config: {unet_params["num_classes"]}")
+ raise NotImplementedError(f"Unknown conditional unet num_classes config: {unet_params['num_classes']}")
config = {
"sample_size": image_size // vae_scale_factor,
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index d6aa1e11b6d2..f7fa82157c52 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -216,6 +216,7 @@
"AmusedInpaintPipeline",
"AmusedPipeline",
"AnimateDiffPipeline",
+ "AnimateDiffSDXLPipeline",
"AnimateDiffVideoToVideoPipeline",
"AudioLDM2Pipeline",
"AudioLDM2ProjectionModel",
@@ -595,6 +596,7 @@
AmusedInpaintPipeline,
AmusedPipeline,
AnimateDiffPipeline,
+ AnimateDiffSDXLPipeline,
AnimateDiffVideoToVideoPipeline,
AudioLDM2Pipeline,
AudioLDM2ProjectionModel,
diff --git a/src/diffusers/configuration_utils.py b/src/diffusers/configuration_utils.py
index 8f4a1958975d..7d76687a3d1e 100644
--- a/src/diffusers/configuration_utils.py
+++ b/src/diffusers/configuration_utils.py
@@ -310,9 +310,9 @@ def load_config(
force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
- incompletely downloaded files are deleted.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -341,7 +341,7 @@ def load_config(
"""
cache_dir = kwargs.pop("cache_dir", None)
force_download = kwargs.pop("force_download", False)
- resume_download = kwargs.pop("resume_download", False)
+ resume_download = kwargs.pop("resume_download", None)
proxies = kwargs.pop("proxies", None)
token = kwargs.pop("token", None)
local_files_only = kwargs.pop("local_files_only", False)
diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py
index 4ccb9d77d627..0f4481570829 100644
--- a/src/diffusers/image_processor.py
+++ b/src/diffusers/image_processor.py
@@ -80,7 +80,6 @@ def __init__(
" if you intended to convert the image into RGB format, please set `do_convert_grayscale = False`.",
" if you intended to convert the image into grayscale format, please set `do_convert_rgb = False`",
)
- self.config.do_convert_rgb = False
@staticmethod
def numpy_to_pil(images: np.ndarray) -> List[PIL.Image.Image]:
diff --git a/src/diffusers/loaders/autoencoder.py b/src/diffusers/loaders/autoencoder.py
index b91d27f7d63e..36b022a26ec9 100644
--- a/src/diffusers/loaders/autoencoder.py
+++ b/src/diffusers/loaders/autoencoder.py
@@ -50,9 +50,9 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
cache_dir (`Union[str, os.PathLike]`, *optional*):
Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
is not used.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
- incompletely downloaded files are deleted.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -99,7 +99,7 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
original_config_file = kwargs.pop("original_config_file", None)
config_file = kwargs.pop("config_file", None)
- resume_download = kwargs.pop("resume_download", False)
+ resume_download = kwargs.pop("resume_download", None)
force_download = kwargs.pop("force_download", False)
proxies = kwargs.pop("proxies", None)
token = kwargs.pop("token", None)
diff --git a/src/diffusers/loaders/controlnet.py b/src/diffusers/loaders/controlnet.py
index d323f60aa7ee..53b9802d390e 100644
--- a/src/diffusers/loaders/controlnet.py
+++ b/src/diffusers/loaders/controlnet.py
@@ -50,9 +50,9 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
cache_dir (`Union[str, os.PathLike]`, *optional*):
Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
is not used.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
- incompletely downloaded files are deleted.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -89,7 +89,7 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
"""
original_config_file = kwargs.pop("original_config_file", None)
config_file = kwargs.pop("config_file", None)
- resume_download = kwargs.pop("resume_download", False)
+ resume_download = kwargs.pop("resume_download", None)
force_download = kwargs.pop("force_download", False)
proxies = kwargs.pop("proxies", None)
token = kwargs.pop("token", None)
diff --git a/src/diffusers/loaders/ip_adapter.py b/src/diffusers/loaders/ip_adapter.py
index 28a4334b199c..ef6a53e43196 100644
--- a/src/diffusers/loaders/ip_adapter.py
+++ b/src/diffusers/loaders/ip_adapter.py
@@ -90,9 +90,9 @@ def load_ip_adapter(
force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
- incompletely downloaded files are deleted.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -135,7 +135,7 @@ def load_ip_adapter(
# Load the main state dict first.
cache_dir = kwargs.pop("cache_dir", None)
force_download = kwargs.pop("force_download", False)
- resume_download = kwargs.pop("resume_download", False)
+ resume_download = kwargs.pop("resume_download", None)
proxies = kwargs.pop("proxies", None)
local_files_only = kwargs.pop("local_files_only", None)
token = kwargs.pop("token", None)
diff --git a/src/diffusers/loaders/lora.py b/src/diffusers/loaders/lora.py
index 5d89658830f1..24be4999de27 100644
--- a/src/diffusers/loaders/lora.py
+++ b/src/diffusers/loaders/lora.py
@@ -176,9 +176,9 @@ def lora_state_dict(
force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
- incompletely downloaded files are deleted.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -208,7 +208,7 @@ def lora_state_dict(
# UNet and text encoder or both.
cache_dir = kwargs.pop("cache_dir", None)
force_download = kwargs.pop("force_download", False)
- resume_download = kwargs.pop("resume_download", False)
+ resume_download = kwargs.pop("resume_download", None)
proxies = kwargs.pop("proxies", None)
local_files_only = kwargs.pop("local_files_only", None)
token = kwargs.pop("token", None)
@@ -369,7 +369,11 @@ def _optionally_disable_offloading(cls, _pipeline):
if not is_model_cpu_offload:
is_model_cpu_offload = isinstance(component._hf_hook, CpuOffload)
if not is_sequential_cpu_offload:
- is_sequential_cpu_offload = isinstance(component._hf_hook, AlignDevicesHook)
+ is_sequential_cpu_offload = (
+ isinstance(component._hf_hook, AlignDevicesHook)
+ or hasattr(component._hf_hook, "hooks")
+ and isinstance(component._hf_hook.hooks[0], AlignDevicesHook)
+ )
logger.info(
"Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again."
@@ -1268,9 +1272,10 @@ def set_lora_device(self, adapter_names: List[str], device: Union[torch.device,
unet_module.lora_A[adapter_name].to(device)
unet_module.lora_B[adapter_name].to(device)
# this is a param, not a module, so device placement is not in-place -> re-assign
- unet_module.lora_magnitude_vector[adapter_name] = unet_module.lora_magnitude_vector[
- adapter_name
- ].to(device)
+ if hasattr(unet_module, "lora_magnitude_vector") and unet_module.lora_magnitude_vector is not None:
+ unet_module.lora_magnitude_vector[adapter_name] = unet_module.lora_magnitude_vector[
+ adapter_name
+ ].to(device)
# Handle the text encoder
modules_to_process = []
@@ -1288,9 +1293,13 @@ def set_lora_device(self, adapter_names: List[str], device: Union[torch.device,
text_encoder_module.lora_A[adapter_name].to(device)
text_encoder_module.lora_B[adapter_name].to(device)
# this is a param, not a module, so device placement is not in-place -> re-assign
- text_encoder_module.lora_magnitude_vector[
- adapter_name
- ] = text_encoder_module.lora_magnitude_vector[adapter_name].to(device)
+ if (
+                        hasattr(text_encoder_module, "lora_magnitude_vector")
+ and text_encoder_module.lora_magnitude_vector is not None
+ ):
+ text_encoder_module.lora_magnitude_vector[
+ adapter_name
+ ] = text_encoder_module.lora_magnitude_vector[adapter_name].to(device)
class StableDiffusionXLLoraLoaderMixin(LoraLoaderMixin):
@@ -1397,6 +1406,9 @@ def save_lora_weights(
text_encoder_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`):
State dict of the LoRA layers corresponding to the `text_encoder`. Must explicitly pass the text
encoder LoRA state dict because it comes from 🤗 Transformers.
+ text_encoder_2_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`):
+ State dict of the LoRA layers corresponding to the `text_encoder_2`. Must explicitly pass the text
+ encoder LoRA state dict because it comes from 🤗 Transformers.
is_main_process (`bool`, *optional*, defaults to `True`):
Whether the process calling this is the main process or not. Useful during distributed training and you
need to call this function on all processes. In this case, set `is_main_process=True` only on the main
@@ -1423,8 +1435,10 @@ def pack_weights(layers, prefix):
if unet_lora_layers:
state_dict.update(pack_weights(unet_lora_layers, "unet"))
- if text_encoder_lora_layers and text_encoder_2_lora_layers:
+ if text_encoder_lora_layers:
state_dict.update(pack_weights(text_encoder_lora_layers, "text_encoder"))
+
+ if text_encoder_2_lora_layers:
state_dict.update(pack_weights(text_encoder_2_lora_layers, "text_encoder_2"))
cls.write_lora_layers(
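The `save_lora_weights` change above packs the two text-encoder LoRA dicts independently, so passing only `text_encoder_lora_layers` no longer requires `text_encoder_2_lora_layers` to be present for the text-encoder weights to be written. A hedged sketch of a call exercising that path; the state-dict keys and tensors are dummy placeholders, not real LoRA weights.

```python
# Hedged sketch: after this change, a UNet + single-text-encoder LoRA can be saved without
# text_encoder_2 layers. The tensors/keys below are dummy placeholders for illustration only.
import torch

from diffusers import StableDiffusionXLPipeline

unet_lora_layers = {"dummy.lora_A.weight": torch.zeros(4, 8)}          # placeholder
text_encoder_lora_layers = {"dummy.lora_A.weight": torch.zeros(4, 8)}  # placeholder

StableDiffusionXLPipeline.save_lora_weights(
    save_directory="sdxl-lora-out",
    unet_lora_layers=unet_lora_layers,
    text_encoder_lora_layers=text_encoder_lora_layers,
    # text_encoder_2_lora_layers intentionally omitted; previously the text_encoder layers were
    # only written when text_encoder_2 layers were passed as well.
)
```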
diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index 752ef18c7a0b..d8ff92d0b0ff 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -177,9 +177,9 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
cache_dir (`Union[str, os.PathLike]`, *optional*):
Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
is not used.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
- incompletely downloaded files are deleted.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -244,7 +244,7 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
```
"""
original_config_file = kwargs.pop("original_config_file", None)
- resume_download = kwargs.pop("resume_download", False)
+ resume_download = kwargs.pop("resume_download", None)
force_download = kwargs.pop("force_download", False)
proxies = kwargs.pop("proxies", None)
token = kwargs.pop("token", None)
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 5b01b8da2b1f..c23e594c3976 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -305,7 +305,7 @@ def fetch_ldm_config_and_checkpoint(
pretrained_model_link_or_path,
class_name,
original_config_file=None,
- resume_download=False,
+ resume_download=None,
force_download=False,
proxies=None,
token=None,
diff --git a/src/diffusers/loaders/textual_inversion.py b/src/diffusers/loaders/textual_inversion.py
index c1c224975cb8..a9b9a9aae052 100644
--- a/src/diffusers/loaders/textual_inversion.py
+++ b/src/diffusers/loaders/textual_inversion.py
@@ -38,7 +38,7 @@
def load_textual_inversion_state_dicts(pretrained_model_name_or_paths, **kwargs):
cache_dir = kwargs.pop("cache_dir", None)
force_download = kwargs.pop("force_download", False)
- resume_download = kwargs.pop("resume_download", False)
+ resume_download = kwargs.pop("resume_download", None)
proxies = kwargs.pop("proxies", None)
local_files_only = kwargs.pop("local_files_only", None)
token = kwargs.pop("token", None)
@@ -308,9 +308,9 @@ def load_textual_inversion(
force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
- incompletely downloaded files are deleted.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -423,7 +423,11 @@ def load_textual_inversion(
if isinstance(component, nn.Module):
if hasattr(component, "_hf_hook"):
is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload)
- is_sequential_cpu_offload = isinstance(getattr(component, "_hf_hook"), AlignDevicesHook)
+ is_sequential_cpu_offload = (
+ isinstance(getattr(component, "_hf_hook"), AlignDevicesHook)
+ or hasattr(component._hf_hook, "hooks")
+ and isinstance(component._hf_hook.hooks[0], AlignDevicesHook)
+ )
logger.info(
"Accelerate hooks detected. Since you have called `load_textual_inversion()`, the previous hooks will be first removed. Then the textual inversion parameters will be loaded and the hooks will be applied again."
)
diff --git a/src/diffusers/loaders/unet.py b/src/diffusers/loaders/unet.py
index 294db44ee61d..5d5ed30dc35f 100644
--- a/src/diffusers/loaders/unet.py
+++ b/src/diffusers/loaders/unet.py
@@ -103,9 +103,9 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
- incompletely downloaded files are deleted.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -149,7 +149,7 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
cache_dir = kwargs.pop("cache_dir", None)
force_download = kwargs.pop("force_download", False)
- resume_download = kwargs.pop("resume_download", False)
+ resume_download = kwargs.pop("resume_download", None)
proxies = kwargs.pop("proxies", None)
local_files_only = kwargs.pop("local_files_only", None)
token = kwargs.pop("token", None)
@@ -359,7 +359,11 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
for _, component in _pipeline.components.items():
if isinstance(component, nn.Module) and hasattr(component, "_hf_hook"):
is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload)
- is_sequential_cpu_offload = isinstance(getattr(component, "_hf_hook"), AlignDevicesHook)
+ is_sequential_cpu_offload = (
+ isinstance(getattr(component, "_hf_hook"), AlignDevicesHook)
+ or hasattr(component._hf_hook, "hooks")
+ and isinstance(component._hf_hook.hooks[0], AlignDevicesHook)
+ )
logger.info(
"Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again."
@@ -1086,9 +1090,9 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
cache_dir (`Union[str, os.PathLike]`, *optional*):
Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
is not used.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
- incompletely downloaded files are deleted.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -1110,7 +1114,7 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
raise ValueError("FromOriginalUNetMixin is currently only compatible with StableCascadeUNet")
config = kwargs.pop("config", None)
- resume_download = kwargs.pop("resume_download", False)
+ resume_download = kwargs.pop("resume_download", None)
force_download = kwargs.pop("force_download", False)
proxies = kwargs.pop("proxies", None)
token = kwargs.pop("token", None)
diff --git a/src/diffusers/models/activations.py b/src/diffusers/models/activations.py
index cec83bdded9e..f94b6c8d6d06 100644
--- a/src/diffusers/models/activations.py
+++ b/src/diffusers/models/activations.py
@@ -18,8 +18,12 @@
from torch import nn
from ..utils import deprecate
+from ..utils.import_utils import is_torch_npu_available
+if is_torch_npu_available():
+ import torch_npu
+
ACTIVATION_FUNCTIONS = {
"swish": nn.SiLU(),
"silu": nn.SiLU(),
@@ -98,9 +102,13 @@ def forward(self, hidden_states, *args, **kwargs):
if len(args) > 0 or kwargs.get("scale", None) is not None:
deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
deprecate("scale", "1.0.0", deprecation_message)
-
- hidden_states, gate = self.proj(hidden_states).chunk(2, dim=-1)
- return hidden_states * self.gelu(gate)
+ hidden_states = self.proj(hidden_states)
+ if is_torch_npu_available():
+            # torch_npu.npu_geglu runs faster and saves memory on NPU.
+ return torch_npu.npu_geglu(hidden_states, dim=-1, approximate=1)[0]
+ else:
+ hidden_states, gate = hidden_states.chunk(2, dim=-1)
+ return hidden_states * self.gelu(gate)
class ApproximateGELU(nn.Module):
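The GEGLU change routes the post-projection split-and-gate through `torch_npu.npu_geglu` when `torch_npu` is available. For readers without NPU hardware, the reference sketch below spells out what the non-fused path computes; treating the fused kernel as equivalent (with the tanh GELU approximation implied by `approximate=1`) is an assumption based on this diff.

```python
# Plain-PyTorch reference for the GEGLU gating that the fused torch_npu.npu_geglu call replaces.
import torch
import torch.nn.functional as F


def geglu_reference(projected: torch.Tensor) -> torch.Tensor:
    # `projected` is the Linear output with 2 * dim_out features.
    hidden, gate = projected.chunk(2, dim=-1)
    return hidden * F.gelu(gate, approximate="tanh")


x = torch.randn(2, 16, 8)        # (batch, seq, 2 * dim_out), dim_out = 4
print(geglu_reference(x).shape)  # torch.Size([2, 16, 4])
```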
diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py
index 429807989296..ea1c987e95c6 100644
--- a/src/diffusers/models/attention_processor.py
+++ b/src/diffusers/models/attention_processor.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
+import math
from importlib import import_module
from typing import Callable, List, Optional, Union
@@ -21,13 +22,15 @@
from ..image_processor import IPAdapterMaskProcessor
from ..utils import deprecate, logging
-from ..utils.import_utils import is_xformers_available
+from ..utils.import_utils import is_torch_npu_available, is_xformers_available
from ..utils.torch_utils import maybe_allow_in_graph
from .lora import LoRALinearLayer
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+if is_torch_npu_available():
+ import torch_npu
if is_xformers_available():
import xformers
@@ -209,6 +212,23 @@ def __init__(
)
self.set_processor(processor)
+ def set_use_npu_flash_attention(self, use_npu_flash_attention: bool) -> None:
+ r"""
+        Set whether to use NPU flash attention from `torch_npu` or not.
+        """
+ if use_npu_flash_attention:
+ processor = AttnProcessorNPU()
+ else:
+ # set attention processor
+ # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
+ # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
+ # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
+ processor = (
+ AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor()
+ )
+ self.set_processor(processor)
+
def set_use_memory_efficient_attention_xformers(
self, use_memory_efficient_attention_xformers: bool, attention_op: Optional[Callable] = None
) -> None:
@@ -1207,6 +1227,116 @@ def __call__(
return hidden_states
+class AttnProcessorNPU:
+    r"""
+    Processor for implementing flash attention using torch_npu. torch_npu supports only fp16 and bf16 data types; if
+    fp32 is used, F.scaled_dot_product_attention is used for the computation instead, and the acceleration on NPU is
+    not significant.
+    """
+
+ def __init__(self):
+ if not is_torch_npu_available():
+ raise ImportError("AttnProcessorNPU requires torch_npu extensions and is supported only on npu devices.")
+
+ def __call__(
+ self,
+ attn: Attention,
+ hidden_states: torch.FloatTensor,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ temb: Optional[torch.FloatTensor] = None,
+ *args,
+ **kwargs,
+ ) -> torch.FloatTensor:
+ if len(args) > 0 or kwargs.get("scale", None) is not None:
+ deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
+ deprecate("scale", "1.0.0", deprecation_message)
+
+ residual = hidden_states
+ if attn.spatial_norm is not None:
+ hidden_states = attn.spatial_norm(hidden_states, temb)
+
+ input_ndim = hidden_states.ndim
+
+ if input_ndim == 4:
+ batch_size, channel, height, width = hidden_states.shape
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+ batch_size, sequence_length, _ = (
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+ )
+
+ if attention_mask is not None:
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+ # scaled_dot_product_attention expects attention_mask shape to be
+ # (batch, heads, source_length, target_length)
+ attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+ if attn.group_norm is not None:
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+ query = attn.to_q(hidden_states)
+
+ if encoder_hidden_states is None:
+ encoder_hidden_states = hidden_states
+ elif attn.norm_cross:
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+ key = attn.to_k(encoder_hidden_states)
+ value = attn.to_v(encoder_hidden_states)
+
+ inner_dim = key.shape[-1]
+ head_dim = inner_dim // attn.heads
+
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
+ if query.dtype in (torch.float16, torch.bfloat16):
+ hidden_states = torch_npu.npu_fusion_attention(
+ query,
+ key,
+ value,
+ attn.heads,
+ input_layout="BNSD",
+ pse=None,
+ atten_mask=attention_mask,
+ scale=1.0 / math.sqrt(query.shape[-1]),
+ pre_tockens=65536,
+ next_tockens=65536,
+ keep_prob=1.0,
+ sync=False,
+ inner_precise=0,
+ )[0]
+ else:
+ # TODO: add support for attn.scale when we move to Torch 2.1
+ hidden_states = F.scaled_dot_product_attention(
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+ )
+
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+ hidden_states = hidden_states.to(query.dtype)
+
+ # linear proj
+ hidden_states = attn.to_out[0](hidden_states)
+ # dropout
+ hidden_states = attn.to_out[1](hidden_states)
+
+ if input_ndim == 4:
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+ if attn.residual_connection:
+ hidden_states = hidden_states + residual
+
+ hidden_states = hidden_states / attn.rescale_output_factor
+
+ return hidden_states
+
+
class AttnProcessor2_0:
r"""
Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
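Together, `AttnProcessorNPU` and the `enable_npu_flash_attention()` / `disable_npu_flash_attention()` helpers added to `ModelMixin` let a model opt into the fused NPU kernel in one call. Below is a short sketch of wiring it into a loaded UNet, guarded the same way the training script does; the checkpoint id is only an example.

```python
# Sketch: opt a UNet into the NPU fused attention path added in this diff. The model id is an
# example checkpoint; on non-NPU machines the guard keeps the default processors.
from diffusers import UNet2DConditionModel
from diffusers.utils.import_utils import is_torch_npu_available

unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet")

if is_torch_npu_available():
    unet.enable_npu_flash_attention()  # swaps every attention processor for AttnProcessorNPU
else:
    print("torch_npu not available; keeping the default attention processors.")
```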
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl.py b/src/diffusers/models/autoencoders/autoencoder_kl.py
index b286453de424..0b9b9d4d47e5 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl.py
@@ -65,6 +65,7 @@ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalVAEMixin):
"""
_supports_gradient_checkpointing = True
+ _no_split_modules = ["BasicTransformerBlock", "ResnetBlock2D"]
@register_to_config
def __init__(
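`AutoencoderKL` now declares `_no_split_modules`, which the `_determine_device_map` helper added further down feeds to accelerate so residual and transformer blocks are never split across devices when a string `device_map` is resolved. The sketch below reproduces that resolution with accelerate directly, without assuming anything about the final public `from_pretrained(device_map=...)` behavior; the checkpoint id is only an example.

```python
# Sketch: resolve a device map for the VAE the same way the new helper does, using accelerate
# directly. On a CPU-only machine everything maps to "cpu".
import torch
from accelerate import infer_auto_device_map
from accelerate.utils import get_balanced_memory

from diffusers import AutoencoderKL

vae = AutoencoderKL.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="vae")

no_split = ["BasicTransformerBlock", "ResnetBlock2D"]  # mirrors the new _no_split_modules
max_memory = get_balanced_memory(vae, dtype=torch.float16, no_split_module_classes=no_split)
device_map = infer_auto_device_map(
    vae, max_memory=max_memory, dtype=torch.float16, no_split_module_classes=no_split
)
print(device_map)  # module name -> device; whole blocks stay on one device
```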
diff --git a/src/diffusers/models/modeling_flax_utils.py b/src/diffusers/models/modeling_flax_utils.py
index 1ddcda9005fc..151281070faa 100644
--- a/src/diffusers/models/modeling_flax_utils.py
+++ b/src/diffusers/models/modeling_flax_utils.py
@@ -245,9 +245,9 @@ def from_pretrained(
force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
- incompletely downloaded files are deleted.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -296,7 +296,7 @@ def from_pretrained(
cache_dir = kwargs.pop("cache_dir", None)
force_download = kwargs.pop("force_download", False)
from_pt = kwargs.pop("from_pt", False)
- resume_download = kwargs.pop("resume_download", False)
+ resume_download = kwargs.pop("resume_download", None)
proxies = kwargs.pop("proxies", None)
local_files_only = kwargs.pop("local_files_only", False)
token = kwargs.pop("token", None)
diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py
index c1fdff8ab356..373f5453aa23 100644
--- a/src/diffusers/models/modeling_utils.py
+++ b/src/diffusers/models/modeling_utils.py
@@ -57,7 +57,8 @@
if is_accelerate_available():
import accelerate
- from accelerate.utils import set_module_tensor_to_device
+ from accelerate import infer_auto_device_map
+ from accelerate.utils import get_balanced_memory, get_max_memory, set_module_tensor_to_device
from accelerate.utils.versions import is_torch_version
@@ -99,6 +100,29 @@ def find_tensor_attributes(module: torch.nn.Module) -> List[Tuple[str, Tensor]]:
return first_tuple[1].dtype
+# Adapted from `transformers` (see modeling_utils.py)
+def _determine_device_map(model: "ModelMixin", device_map, max_memory, torch_dtype):
+ if isinstance(device_map, str):
+ no_split_modules = model._get_no_split_modules(device_map)
+ device_map_kwargs = {"no_split_module_classes": no_split_modules}
+
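+ # "auto" and the "balanced*" strategies compute a memory-balanced split across devices;
+ # "sequential" simply fills each device up to its max memory in order.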
+ if device_map != "sequential":
+ max_memory = get_balanced_memory(
+ model,
+ dtype=torch_dtype,
+ low_zero=(device_map == "balanced_low_0"),
+ max_memory=max_memory,
+ **device_map_kwargs,
+ )
+ else:
+ max_memory = get_max_memory(max_memory)
+
+ device_map_kwargs["max_memory"] = max_memory
+ device_map = infer_auto_device_map(model, dtype=torch_dtype, **device_map_kwargs)
+
+ return device_map
+
+
def load_state_dict(checkpoint_file: Union[str, os.PathLike], variant: Optional[str] = None):
"""
Reads a checkpoint file, returning properly formatted errors if they arise.
@@ -201,6 +225,7 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
_automatically_saved_args = ["_diffusers_version", "_class_name", "_name_or_path"]
_supports_gradient_checkpointing = False
_keys_to_ignore_on_load_unexpected = None
+ _no_split_modules = None
def __init__(self):
super().__init__()
@@ -247,6 +272,36 @@ def disable_gradient_checkpointing(self) -> None:
if self._supports_gradient_checkpointing:
self.apply(partial(self._set_gradient_checkpointing, value=False))
+ def set_use_npu_flash_attention(self, valid: bool) -> None:
+ r"""
+ Set whether to use NPU flash attention (from torch_npu) in the model's attention processors.
+ """
+
+ def fn_recursive_set_npu_flash_attention(module: torch.nn.Module):
+ if hasattr(module, "set_use_npu_flash_attention"):
+ module.set_use_npu_flash_attention(valid)
+
+ for child in module.children():
+ fn_recursive_set_npu_flash_attention(child)
+
+ for module in self.children():
+ if isinstance(module, torch.nn.Module):
+ fn_recursive_set_npu_flash_attention(module)
+
+ def enable_npu_flash_attention(self) -> None:
+ r"""
+ Enable NPU flash attention from torch_npu.
+
+ """
+ self.set_use_npu_flash_attention(True)
+
+ def disable_npu_flash_attention(self) -> None:
+ r"""
+ Disable NPU flash attention from torch_npu.
+
+ """
+ self.set_use_npu_flash_attention(False)
+
def set_use_memory_efficient_attention_xformers(
self, valid: bool, attention_op: Optional[Callable] = None
) -> None:
@@ -421,9 +476,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
- incompletely downloaded files are deleted.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -505,7 +560,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False)
force_download = kwargs.pop("force_download", False)
from_flax = kwargs.pop("from_flax", False)
- resume_download = kwargs.pop("resume_download", False)
+ resume_download = kwargs.pop("resume_download", None)
proxies = kwargs.pop("proxies", None)
output_loading_info = kwargs.pop("output_loading_info", False)
local_files_only = kwargs.pop("local_files_only", None)
@@ -560,6 +615,36 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
" dispatching. Please make sure to set `low_cpu_mem_usage=True`."
)
+ # change device_map into a map if we passed an int, a str or a torch.device
+ if isinstance(device_map, torch.device):
+ device_map = {"": device_map}
+ elif isinstance(device_map, str) and device_map not in ["auto", "balanced", "balanced_low_0", "sequential"]:
+ try:
+ device_map = {"": torch.device(device_map)}
+ except RuntimeError:
+ raise ValueError(
+ "When passing device_map as a string, the value needs to be a device name (e.g. cpu, cuda:0) or "
+ f"'auto', 'balanced', 'balanced_low_0', 'sequential' but found {device_map}."
+ )
+ elif isinstance(device_map, int):
+ if device_map < 0:
+ raise ValueError(
+ "You can't pass device_map as a negative int. If you want to put the model on the cpu, pass device_map = 'cpu' "
+ )
+ else:
+ device_map = {"": device_map}
+
+ if device_map is not None:
+ if low_cpu_mem_usage is None:
+ low_cpu_mem_usage = True
+ elif not low_cpu_mem_usage:
+ raise ValueError("Passing along a `device_map` requires `low_cpu_mem_usage=True`")
+
+ if low_cpu_mem_usage:
+ if device_map is not None and not is_torch_version(">=", "1.10"):
+ # The max memory utils require PyTorch >= 1.10 to have torch.cuda.mem_get_info.
+ raise ValueError("`low_cpu_mem_usage` and `device_map` require PyTorch >= 1.10.")
+
# Load config if we don't provide a configuration
config_path = pretrained_model_name_or_path
@@ -582,10 +667,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
token=token,
revision=revision,
subfolder=subfolder,
- device_map=device_map,
- max_memory=max_memory,
- offload_folder=offload_folder,
- offload_state_dict=offload_state_dict,
user_agent=user_agent,
**kwargs,
)
@@ -690,6 +771,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
else: # else let accelerate handle loading and dispatching.
# Load weights and dispatch according to the device_map
# by default the device_map is None and the weights are loaded on the CPU
+ device_map = _determine_device_map(model, device_map, max_memory, torch_dtype)
try:
accelerate.load_checkpoint_and_dispatch(
model,
@@ -881,6 +963,36 @@ def _find_mismatched_keys(
return model, missing_keys, unexpected_keys, mismatched_keys, error_msgs
+ # Adapted from `transformers` modeling_utils.py
+ def _get_no_split_modules(self, device_map: str):
+ """
+ Get the modules of the model that should not be split when using device_map. We iterate through the modules to
+ get the underlying `_no_split_modules`.
+
+ Args:
+ device_map (`str`):
+ The device map value. Options are ["auto", "balanced", "balanced_low_0", "sequential"]
+
+ Returns:
+ `List[str]`: List of modules that should not be split
+ """
+ _no_split_modules = set()
+ modules_to_check = [self]
+ while len(modules_to_check) > 0:
+ module = modules_to_check.pop(-1)
+ # if the module does not appear in _no_split_modules, we also check the children
+ if module.__class__.__name__ not in _no_split_modules:
+ if isinstance(module, ModelMixin):
+ if module._no_split_modules is None:
+ raise ValueError(
+ f"{module.__class__.__name__} does not support `device_map='{device_map}'`. To implement support, the model "
+ "class needs to implement the `_no_split_modules` attribute."
+ )
+ else:
+ _no_split_modules = _no_split_modules | set(module._no_split_modules)
+ modules_to_check += list(module.children())
+ return list(_no_split_modules)
+
@property
def device(self) -> torch.device:
"""
diff --git a/src/diffusers/models/transformers/transformer_2d.py b/src/diffusers/models/transformers/transformer_2d.py
index 768fceb71ae5..6a2695b9e436 100644
--- a/src/diffusers/models/transformers/transformer_2d.py
+++ b/src/diffusers/models/transformers/transformer_2d.py
@@ -72,6 +72,7 @@ class Transformer2DModel(ModelMixin, ConfigMixin):
"""
_supports_gradient_checkpointing = True
+ _no_split_modules = ["BasicTransformerBlock"]
@register_to_config
def __init__(
diff --git a/src/diffusers/models/unets/unet_2d_condition.py b/src/diffusers/models/unets/unet_2d_condition.py
index 34327e1049c5..697730b359ff 100644
--- a/src/diffusers/models/unets/unet_2d_condition.py
+++ b/src/diffusers/models/unets/unet_2d_condition.py
@@ -161,6 +161,7 @@ class conditioning with `class_embed_type` equal to `None`.
"""
_supports_gradient_checkpointing = True
+ _no_split_modules = ["BasicTransformerBlock", "ResnetBlock2D", "CrossAttnUpBlock2D"]
@register_to_config
def __init__(
diff --git a/src/diffusers/models/unets/unet_3d_blocks.py b/src/diffusers/models/unets/unet_3d_blocks.py
index 97c91f61da1c..75827258f64c 100644
--- a/src/diffusers/models/unets/unet_3d_blocks.py
+++ b/src/diffusers/models/unets/unet_3d_blocks.py
@@ -121,6 +121,7 @@ def get_down_block(
raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlockMotion")
return CrossAttnDownBlockMotion(
num_layers=num_layers,
+ transformer_layers_per_block=transformer_layers_per_block,
in_channels=in_channels,
out_channels=out_channels,
temb_channels=temb_channels,
@@ -255,6 +256,7 @@ def get_up_block(
raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlockMotion")
return CrossAttnUpBlockMotion(
num_layers=num_layers,
+ transformer_layers_per_block=transformer_layers_per_block,
in_channels=in_channels,
out_channels=out_channels,
prev_output_channel=prev_output_channel,
diff --git a/src/diffusers/models/unets/unet_motion_model.py b/src/diffusers/models/unets/unet_motion_model.py
index 595b7b03571c..81cc4b1f7ad4 100644
--- a/src/diffusers/models/unets/unet_motion_model.py
+++ b/src/diffusers/models/unets/unet_motion_model.py
@@ -211,6 +211,8 @@ def __init__(
norm_num_groups: int = 32,
norm_eps: float = 1e-5,
cross_attention_dim: int = 1280,
+ transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1,
+ reverse_transformer_layers_per_block: Optional[Tuple[Tuple[int]]] = None,
use_linear_projection: bool = False,
num_attention_heads: Union[int, Tuple[int, ...]] = 8,
motion_max_seq_length: int = 32,
@@ -218,6 +220,9 @@ def __init__(
use_motion_mid_block: int = True,
encoder_hid_dim: Optional[int] = None,
encoder_hid_dim_type: Optional[str] = None,
+ addition_embed_type: Optional[str] = None,
+ addition_time_embed_dim: Optional[int] = None,
+ projection_class_embeddings_input_dim: Optional[int] = None,
time_cond_proj_dim: Optional[int] = None,
):
super().__init__()
@@ -240,6 +245,21 @@ def __init__(
f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
)
+ if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types):
+ raise ValueError(
+ f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
+ )
+
+ if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
+ raise ValueError(
+ f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
+ )
+
+ if isinstance(transformer_layers_per_block, list) and reverse_transformer_layers_per_block is None:
+ for layer_number_per_block in transformer_layers_per_block:
+ if isinstance(layer_number_per_block, list):
+ raise ValueError("Must provide 'reverse_transformer_layers_per_block` if using asymmetrical UNet.")
+
# input
conv_in_kernel = 3
conv_out_kernel = 3
@@ -260,6 +280,10 @@ def __init__(
if encoder_hid_dim_type is None:
self.encoder_hid_proj = None
+ if addition_embed_type == "text_time":
+ self.add_time_proj = Timesteps(addition_time_embed_dim, True, 0)
+ self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
+
# class embedding
self.down_blocks = nn.ModuleList([])
self.up_blocks = nn.ModuleList([])
@@ -267,6 +291,15 @@ def __init__(
if isinstance(num_attention_heads, int):
num_attention_heads = (num_attention_heads,) * len(down_block_types)
+ if isinstance(cross_attention_dim, int):
+ cross_attention_dim = (cross_attention_dim,) * len(down_block_types)
+
+ if isinstance(layers_per_block, int):
+ layers_per_block = [layers_per_block] * len(down_block_types)
+
+ if isinstance(transformer_layers_per_block, int):
+ transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
+
# down
output_channel = block_out_channels[0]
for i, down_block_type in enumerate(down_block_types):
@@ -276,7 +309,7 @@ def __init__(
down_block = get_down_block(
down_block_type,
- num_layers=layers_per_block,
+ num_layers=layers_per_block[i],
in_channels=input_channel,
out_channels=output_channel,
temb_channels=time_embed_dim,
@@ -284,13 +317,14 @@ def __init__(
resnet_eps=norm_eps,
resnet_act_fn=act_fn,
resnet_groups=norm_num_groups,
- cross_attention_dim=cross_attention_dim,
+ cross_attention_dim=cross_attention_dim[i],
num_attention_heads=num_attention_heads[i],
downsample_padding=downsample_padding,
use_linear_projection=use_linear_projection,
dual_cross_attention=False,
temporal_num_attention_heads=motion_num_attention_heads,
temporal_max_seq_length=motion_max_seq_length,
+ transformer_layers_per_block=transformer_layers_per_block[i],
)
self.down_blocks.append(down_block)
@@ -302,13 +336,14 @@ def __init__(
resnet_eps=norm_eps,
resnet_act_fn=act_fn,
output_scale_factor=mid_block_scale_factor,
- cross_attention_dim=cross_attention_dim,
+ cross_attention_dim=cross_attention_dim[-1],
num_attention_heads=num_attention_heads[-1],
resnet_groups=norm_num_groups,
dual_cross_attention=False,
use_linear_projection=use_linear_projection,
temporal_num_attention_heads=motion_num_attention_heads,
temporal_max_seq_length=motion_max_seq_length,
+ transformer_layers_per_block=transformer_layers_per_block[-1],
)
else:
@@ -318,11 +353,12 @@ def __init__(
resnet_eps=norm_eps,
resnet_act_fn=act_fn,
output_scale_factor=mid_block_scale_factor,
- cross_attention_dim=cross_attention_dim,
+ cross_attention_dim=cross_attention_dim[-1],
num_attention_heads=num_attention_heads[-1],
resnet_groups=norm_num_groups,
dual_cross_attention=False,
use_linear_projection=use_linear_projection,
+ transformer_layers_per_block=transformer_layers_per_block[-1],
)
# count how many layers upsample the images
@@ -331,6 +367,9 @@ def __init__(
# up
reversed_block_out_channels = list(reversed(block_out_channels))
reversed_num_attention_heads = list(reversed(num_attention_heads))
+ reversed_layers_per_block = list(reversed(layers_per_block))
+ reversed_cross_attention_dim = list(reversed(cross_attention_dim))
+ reversed_transformer_layers_per_block = list(reversed(transformer_layers_per_block))
output_channel = reversed_block_out_channels[0]
for i, up_block_type in enumerate(up_block_types):
@@ -349,7 +388,7 @@ def __init__(
up_block = get_up_block(
up_block_type,
- num_layers=layers_per_block + 1,
+ num_layers=reversed_layers_per_block[i] + 1,
in_channels=input_channel,
out_channels=output_channel,
prev_output_channel=prev_output_channel,
@@ -358,13 +397,14 @@ def __init__(
resnet_eps=norm_eps,
resnet_act_fn=act_fn,
resnet_groups=norm_num_groups,
- cross_attention_dim=cross_attention_dim,
+ cross_attention_dim=reversed_cross_attention_dim[i],
num_attention_heads=reversed_num_attention_heads[i],
dual_cross_attention=False,
resolution_idx=i,
use_linear_projection=use_linear_projection,
temporal_num_attention_heads=motion_num_attention_heads,
temporal_max_seq_length=motion_max_seq_length,
+ transformer_layers_per_block=reversed_transformer_layers_per_block[i],
)
self.up_blocks.append(up_block)
prev_output_channel = output_channel
@@ -835,6 +875,28 @@ def forward(
t_emb = t_emb.to(dtype=self.dtype)
emb = self.time_embedding(t_emb, timestep_cond)
+ aug_emb = None
+
+ if self.config.addition_embed_type == "text_time":
+ if "text_embeds" not in added_cond_kwargs:
+ raise ValueError(
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
+ )
+
+ text_embeds = added_cond_kwargs.get("text_embeds")
+ if "time_ids" not in added_cond_kwargs:
+ raise ValueError(
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
+ )
+ time_ids = added_cond_kwargs.get("time_ids")
+ time_embeds = self.add_time_proj(time_ids.flatten())
+ time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
+
+ add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
+ add_embeds = add_embeds.to(emb.dtype)
+ aug_emb = self.add_embedding(add_embeds)
+
+ emb = emb if aug_emb is None else emb + aug_emb
emb = emb.repeat_interleave(repeats=num_frames, dim=0)
encoder_hidden_states = encoder_hidden_states.repeat_interleave(repeats=num_frames, dim=0)
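These `UNetMotionModel` additions (per-block `transformer_layers_per_block`, `cross_attention_dim` and `layers_per_block`, plus `text_time` additional embeddings) are what make an SDXL UNet convertible. A rough sketch with illustrative checkpoints:

```py
import torch
from diffusers import UNet2DConditionModel
from diffusers.models import MotionAdapter, UNetMotionModel

sdxl_unet = UNet2DConditionModel.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet", torch_dtype=torch.float16
)
adapter = MotionAdapter.from_pretrained(
    "a-r-r-o-w/animatediff-motion-adapter-sdxl-beta", torch_dtype=torch.float16
)
# from_unet2d copies the SDXL config (per-block transformer depths,
# addition_embed_type="text_time", ...) into the motion UNet.
motion_unet = UNetMotionModel.from_unet2d(sdxl_unet, adapter)
```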
diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py
index 169e44c9e3cd..c2dd7ac0d551 100644
--- a/src/diffusers/pipelines/__init__.py
+++ b/src/diffusers/pipelines/__init__.py
@@ -114,6 +114,7 @@
_import_structure["amused"] = ["AmusedImg2ImgPipeline", "AmusedInpaintPipeline", "AmusedPipeline"]
_import_structure["animatediff"] = [
"AnimateDiffPipeline",
+ "AnimateDiffSDXLPipeline",
"AnimateDiffVideoToVideoPipeline",
]
_import_structure["audioldm"] = ["AudioLDMPipeline"]
@@ -367,7 +368,7 @@
from ..utils.dummy_torch_and_transformers_objects import *
else:
from .amused import AmusedImg2ImgPipeline, AmusedInpaintPipeline, AmusedPipeline
- from .animatediff import AnimateDiffPipeline, AnimateDiffVideoToVideoPipeline
+ from .animatediff import AnimateDiffPipeline, AnimateDiffSDXLPipeline, AnimateDiffVideoToVideoPipeline
from .audioldm import AudioLDMPipeline
from .audioldm2 import (
AudioLDM2Pipeline,
diff --git a/src/diffusers/pipelines/animatediff/__init__.py b/src/diffusers/pipelines/animatediff/__init__.py
index 35b99a76fd21..ae6b67b1924c 100644
--- a/src/diffusers/pipelines/animatediff/__init__.py
+++ b/src/diffusers/pipelines/animatediff/__init__.py
@@ -22,6 +22,7 @@
_dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
else:
_import_structure["pipeline_animatediff"] = ["AnimateDiffPipeline"]
+ _import_structure["pipeline_animatediff_sdxl"] = ["AnimateDiffSDXLPipeline"]
_import_structure["pipeline_animatediff_video2video"] = ["AnimateDiffVideoToVideoPipeline"]
if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
@@ -33,6 +34,7 @@
else:
from .pipeline_animatediff import AnimateDiffPipeline
+ from .pipeline_animatediff_sdxl import AnimateDiffSDXLPipeline
from .pipeline_animatediff_video2video import AnimateDiffVideoToVideoPipeline
from .pipeline_output import AnimateDiffPipelineOutput
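With both the eager and lazy import tables updated, the new pipeline resolves from either path (assuming the top-level `diffusers/__init__.py` export that the docstring example below relies on):

```py
from diffusers import AnimateDiffSDXLPipeline
from diffusers.pipelines.animatediff import AnimateDiffSDXLPipeline as _AnimateDiffSDXLPipeline

assert AnimateDiffSDXLPipeline is _AnimateDiffSDXLPipeline
```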
diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py
new file mode 100644
index 000000000000..f15cd5dbebf7
--- /dev/null
+++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py
@@ -0,0 +1,1284 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from transformers import (
+ CLIPImageProcessor,
+ CLIPTextModel,
+ CLIPTextModelWithProjection,
+ CLIPTokenizer,
+ CLIPVisionModelWithProjection,
+)
+
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import (
+ FromSingleFileMixin,
+ IPAdapterMixin,
+ StableDiffusionXLLoraLoaderMixin,
+ TextualInversionLoaderMixin,
+)
+from ...models import AutoencoderKL, ImageProjection, MotionAdapter, UNet2DConditionModel, UNetMotionModel
+from ...models.attention_processor import (
+ AttnProcessor2_0,
+ FusedAttnProcessor2_0,
+ LoRAAttnProcessor2_0,
+ LoRAXFormersAttnProcessor,
+ XFormersAttnProcessor,
+)
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import (
+ DDIMScheduler,
+ DPMSolverMultistepScheduler,
+ EulerAncestralDiscreteScheduler,
+ EulerDiscreteScheduler,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+)
+from ...utils import (
+ USE_PEFT_BACKEND,
+ logging,
+ replace_example_docstring,
+ scale_lora_layers,
+ unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..free_init_utils import FreeInitMixin
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from .pipeline_output import AnimateDiffPipelineOutput
+
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+ Examples:
+ ```py
+ >>> import torch
+ >>> from diffusers.models import MotionAdapter
+ >>> from diffusers import AnimateDiffSDXLPipeline, DDIMScheduler
+ >>> from diffusers.utils import export_to_gif
+
+ >>> adapter = MotionAdapter.from_pretrained(
+ ... "a-r-r-o-w/animatediff-motion-adapter-sdxl-beta", torch_dtype=torch.float16
+ ... )
+
+ >>> model_id = "stabilityai/stable-diffusion-xl-base-1.0"
+ >>> scheduler = DDIMScheduler.from_pretrained(
+ ... model_id,
+ ... subfolder="scheduler",
+ ... clip_sample=False,
+ ... timestep_spacing="linspace",
+ ... beta_schedule="linear",
+ ... steps_offset=1,
+ ... )
+ >>> pipe = AnimateDiffSDXLPipeline.from_pretrained(
+ ... model_id,
+ ... motion_adapter=adapter,
+ ... scheduler=scheduler,
+ ... torch_dtype=torch.float16,
+ ... variant="fp16",
+ ... ).to("cuda")
+
+ >>> # enable memory savings
+ >>> pipe.enable_vae_slicing()
+ >>> pipe.enable_vae_tiling()
+
+ >>> output = pipe(
+ ... prompt="a panda surfing in the ocean, realistic, high quality",
+ ... negative_prompt="low quality, worst quality",
+ ... num_inference_steps=20,
+ ... guidance_scale=8,
+ ... width=1024,
+ ... height=1024,
+ ... num_frames=16,
+ ... )
+
+ >>> frames = output.frames[0]
+ >>> export_to_gif(frames, "animation.gif")
+ ```
+"""
+
+
+# Copied from diffusers.pipelines.animatediff.pipeline_animatediff.tensor2vid
+def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type: str = "np"):
+ batch_size, channels, num_frames, height, width = video.shape
+ outputs = []
+ for batch_idx in range(batch_size):
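+ # video[batch_idx] is (channels, frames, height, width); put frames first so each frame is post-processed as an image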
+ batch_vid = video[batch_idx].permute(1, 0, 2, 3)
+ batch_output = processor.postprocess(batch_vid, output_type)
+
+ outputs.append(batch_output)
+
+ if output_type == "np":
+ outputs = np.stack(outputs)
+
+ elif output_type == "pt":
+ outputs = torch.stack(outputs)
+
+ elif not output_type == "pil":
+ raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']")
+
+ return outputs
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+ """
+ Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+ Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+ """
+ std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+ std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+ # rescale the results from guidance (fixes overexposure)
+ noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+ # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+ noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+ return noise_cfg
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+ scheduler,
+ num_inference_steps: Optional[int] = None,
+ device: Optional[Union[str, torch.device]] = None,
+ timesteps: Optional[List[int]] = None,
+ **kwargs,
+):
+ """
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+ Args:
+ scheduler (`SchedulerMixin`):
+ The scheduler to get timesteps from.
+ num_inference_steps (`int`):
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+ must be `None`.
+ device (`str` or `torch.device`, *optional*):
+ The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
+ timesteps (`List[int]`, *optional*):
+ Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
+ timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
+ must be `None`.
+
+ Returns:
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+ second element is the number of inference steps.
+ """
+ if timesteps is not None:
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+ if not accepts_timesteps:
+ raise ValueError(
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+ f" timestep schedules. Please check whether you are using the correct scheduler."
+ )
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+ timesteps = scheduler.timesteps
+ num_inference_steps = len(timesteps)
+ else:
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+ timesteps = scheduler.timesteps
+ return timesteps, num_inference_steps
+
+
+class AnimateDiffSDXLPipeline(
+ DiffusionPipeline,
+ StableDiffusionMixin,
+ FromSingleFileMixin,
+ StableDiffusionXLLoraLoaderMixin,
+ TextualInversionLoaderMixin,
+ IPAdapterMixin,
+ FreeInitMixin,
+):
+ r"""
+ Pipeline for text-to-video generation using Stable Diffusion XL.
+
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+ The pipeline also inherits the following loading methods:
+ - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+ - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+ - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+ - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+ - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+ Args:
+ vae ([`AutoencoderKL`]):
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+ text_encoder ([`CLIPTextModel`]):
+ Frozen text-encoder. Stable Diffusion XL uses the text portion of
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+ text_encoder_2 ([`CLIPTextModelWithProjection`]):
+ Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+ specifically the
+ [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
+ variant.
+ tokenizer (`CLIPTokenizer`):
+ Tokenizer of class
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+ tokenizer_2 (`CLIPTokenizer`):
+ Second Tokenizer of class
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+ unet ([`UNet2DConditionModel`]):
+ Conditional U-Net architecture to denoise the encoded image latents.
+ scheduler ([`SchedulerMixin`]):
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+ force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `True`):
+ Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
+ `stabilityai/stable-diffusion-xl-base-1.0`.
+ """
+
+ model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->unet->vae"
+ _optional_components = [
+ "tokenizer",
+ "tokenizer_2",
+ "text_encoder",
+ "text_encoder_2",
+ "image_encoder",
+ "feature_extractor",
+ ]
+ _callback_tensor_inputs = [
+ "latents",
+ "prompt_embeds",
+ "negative_prompt_embeds",
+ "add_text_embeds",
+ "add_time_ids",
+ "negative_pooled_prompt_embeds",
+ "negative_add_time_ids",
+ ]
+
+ def __init__(
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ text_encoder_2: CLIPTextModelWithProjection,
+ tokenizer: CLIPTokenizer,
+ tokenizer_2: CLIPTokenizer,
+ unet: Union[UNet2DConditionModel, UNetMotionModel],
+ motion_adapter: MotionAdapter,
+ scheduler: Union[
+ DDIMScheduler,
+ PNDMScheduler,
+ LMSDiscreteScheduler,
+ EulerDiscreteScheduler,
+ EulerAncestralDiscreteScheduler,
+ DPMSolverMultistepScheduler,
+ ],
+ image_encoder: CLIPVisionModelWithProjection = None,
+ feature_extractor: CLIPImageProcessor = None,
+ force_zeros_for_empty_prompt: bool = True,
+ ):
+ super().__init__()
+
+ if isinstance(unet, UNet2DConditionModel):
+ unet = UNetMotionModel.from_unet2d(unet, motion_adapter)
+
+ self.register_modules(
+ vae=vae,
+ text_encoder=text_encoder,
+ text_encoder_2=text_encoder_2,
+ tokenizer=tokenizer,
+ tokenizer_2=tokenizer_2,
+ unet=unet,
+ motion_adapter=motion_adapter,
+ scheduler=scheduler,
+ image_encoder=image_encoder,
+ feature_extractor=feature_extractor,
+ )
+ self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+ self.default_sample_size = self.unet.config.sample_size
+
+ # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt with num_images_per_prompt->num_videos_per_prompt
+ def encode_prompt(
+ self,
+ prompt: str,
+ prompt_2: Optional[str] = None,
+ device: Optional[torch.device] = None,
+ num_videos_per_prompt: int = 1,
+ do_classifier_free_guidance: bool = True,
+ negative_prompt: Optional[str] = None,
+ negative_prompt_2: Optional[str] = None,
+ prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+ lora_scale: Optional[float] = None,
+ clip_skip: Optional[int] = None,
+ ):
+ r"""
+ Encodes the prompt into text encoder hidden states.
+
+ Args:
+ prompt (`str` or `List[str]`, *optional*):
+ prompt to be encoded
+ prompt_2 (`str` or `List[str]`, *optional*):
+ The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+ used in both text-encoders
+ device (`torch.device`):
+ torch device
+ num_videos_per_prompt (`int`):
+ number of images that should be generated per prompt
+ do_classifier_free_guidance (`bool`):
+ whether to use classifier free guidance or not
+ negative_prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+ less than `1`).
+ negative_prompt_2 (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+ `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+ prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+ provided, text embeddings will be generated from `prompt` input argument.
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+ argument.
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
+ negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+ input argument.
+ lora_scale (`float`, *optional*):
+ A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+ clip_skip (`int`, *optional*):
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+ the output of the pre-final layer will be used for computing the prompt embeddings.
+ """
+ device = device or self._execution_device
+
+ # set lora scale so that monkey patched LoRA
+ # function of text encoder can correctly access it
+ if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
+ self._lora_scale = lora_scale
+
+ # dynamically adjust the LoRA scale
+ if self.text_encoder is not None:
+ if not USE_PEFT_BACKEND:
+ adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+ else:
+ scale_lora_layers(self.text_encoder, lora_scale)
+
+ if self.text_encoder_2 is not None:
+ if not USE_PEFT_BACKEND:
+ adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
+ else:
+ scale_lora_layers(self.text_encoder_2, lora_scale)
+
+ prompt = [prompt] if isinstance(prompt, str) else prompt
+
+ if prompt is not None:
+ batch_size = len(prompt)
+ else:
+ batch_size = prompt_embeds.shape[0]
+
+ # Define tokenizers and text encoders
+ tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
+ text_encoders = (
+ [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
+ )
+
+ if prompt_embeds is None:
+ prompt_2 = prompt_2 or prompt
+ prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+ # textual inversion: process multi-vector tokens if necessary
+ prompt_embeds_list = []
+ prompts = [prompt, prompt_2]
+ for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
+ if isinstance(self, TextualInversionLoaderMixin):
+ prompt = self.maybe_convert_prompt(prompt, tokenizer)
+
+ text_inputs = tokenizer(
+ prompt,
+ padding="max_length",
+ max_length=tokenizer.model_max_length,
+ truncation=True,
+ return_tensors="pt",
+ )
+
+ text_input_ids = text_inputs.input_ids
+ untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+ text_input_ids, untruncated_ids
+ ):
+ removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
+ logger.warning(
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
+ f" {tokenizer.model_max_length} tokens: {removed_text}"
+ )
+
+ prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+
+ # We are always interested only in the pooled output of the final text encoder
+ pooled_prompt_embeds = prompt_embeds[0]
+ if clip_skip is None:
+ prompt_embeds = prompt_embeds.hidden_states[-2]
+ else:
+ # "2" because SDXL always indexes from the penultimate layer.
+ prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
+
+ prompt_embeds_list.append(prompt_embeds)
+
+ prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+
+ # get unconditional embeddings for classifier free guidance
+ zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
+ if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
+ negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+ negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
+ elif do_classifier_free_guidance and negative_prompt_embeds is None:
+ negative_prompt = negative_prompt or ""
+ negative_prompt_2 = negative_prompt_2 or negative_prompt
+
+ # normalize str to list
+ negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+ negative_prompt_2 = (
+ batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
+ )
+
+ uncond_tokens: List[str]
+ if prompt is not None and type(prompt) is not type(negative_prompt):
+ raise TypeError(
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+ f" {type(prompt)}."
+ )
+ elif batch_size != len(negative_prompt):
+ raise ValueError(
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+ " the batch size of `prompt`."
+ )
+ else:
+ uncond_tokens = [negative_prompt, negative_prompt_2]
+
+ negative_prompt_embeds_list = []
+ for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
+ if isinstance(self, TextualInversionLoaderMixin):
+ negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)
+
+ max_length = prompt_embeds.shape[1]
+ uncond_input = tokenizer(
+ negative_prompt,
+ padding="max_length",
+ max_length=max_length,
+ truncation=True,
+ return_tensors="pt",
+ )
+
+ negative_prompt_embeds = text_encoder(
+ uncond_input.input_ids.to(device),
+ output_hidden_states=True,
+ )
+ # We are always interested only in the pooled output of the final text encoder
+ negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+ negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
+
+ negative_prompt_embeds_list.append(negative_prompt_embeds)
+
+ negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
+
+ if self.text_encoder_2 is not None:
+ prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+ else:
+ prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+ bs_embed, seq_len, _ = prompt_embeds.shape
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
+ prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
+ prompt_embeds = prompt_embeds.view(bs_embed * num_videos_per_prompt, seq_len, -1)
+
+ if do_classifier_free_guidance:
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+ seq_len = negative_prompt_embeds.shape[1]
+
+ if self.text_encoder_2 is not None:
+ negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+ else:
+ negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+ negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_videos_per_prompt, 1)
+ negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
+
+ pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_videos_per_prompt).view(
+ bs_embed * num_videos_per_prompt, -1
+ )
+ if do_classifier_free_guidance:
+ negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_videos_per_prompt).view(
+ bs_embed * num_videos_per_prompt, -1
+ )
+
+ if self.text_encoder is not None:
+ if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+ # Retrieve the original scale by scaling back the LoRA layers
+ unscale_lora_layers(self.text_encoder, lora_scale)
+
+ if self.text_encoder_2 is not None:
+ if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+ # Retrieve the original scale by scaling back the LoRA layers
+ unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+ return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+ def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+ dtype = next(self.image_encoder.parameters()).dtype
+
+ if not isinstance(image, torch.Tensor):
+ image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+ image = image.to(device=device, dtype=dtype)
+ if output_hidden_states:
+ image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+ image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+ uncond_image_enc_hidden_states = self.image_encoder(
+ torch.zeros_like(image), output_hidden_states=True
+ ).hidden_states[-2]
+ uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+ num_images_per_prompt, dim=0
+ )
+ return image_enc_hidden_states, uncond_image_enc_hidden_states
+ else:
+ image_embeds = self.image_encoder(image).image_embeds
+ image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+ uncond_image_embeds = torch.zeros_like(image_embeds)
+
+ return image_embeds, uncond_image_embeds
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+ def prepare_ip_adapter_image_embeds(
+ self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+ ):
+ if ip_adapter_image_embeds is None:
+ if not isinstance(ip_adapter_image, list):
+ ip_adapter_image = [ip_adapter_image]
+
+ if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+ raise ValueError(
+ f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+ )
+
+ image_embeds = []
+ for single_ip_adapter_image, image_proj_layer in zip(
+ ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+ ):
+ output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+ single_image_embeds, single_negative_image_embeds = self.encode_image(
+ single_ip_adapter_image, device, 1, output_hidden_state
+ )
+ single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
+ single_negative_image_embeds = torch.stack(
+ [single_negative_image_embeds] * num_images_per_prompt, dim=0
+ )
+
+ if do_classifier_free_guidance:
+ single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
+ single_image_embeds = single_image_embeds.to(device)
+
+ image_embeds.append(single_image_embeds)
+ else:
+ repeat_dims = [1]
+ image_embeds = []
+ for single_image_embeds in ip_adapter_image_embeds:
+ if do_classifier_free_guidance:
+ single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+ single_image_embeds = single_image_embeds.repeat(
+ num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
+ )
+ single_negative_image_embeds = single_negative_image_embeds.repeat(
+ num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
+ )
+ single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
+ else:
+ single_image_embeds = single_image_embeds.repeat(
+ num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
+ )
+ image_embeds.append(single_image_embeds)
+
+ return image_embeds
+
+ # Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents
+ def decode_latents(self, latents):
+ latents = 1 / self.vae.config.scaling_factor * latents
+
+ batch_size, channels, num_frames, height, width = latents.shape
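+ # fold frames into the batch: (b, c, f, h, w) -> (b, f, c, h, w) -> (b*f, c, h, w) so the image VAE decodes frame by frame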
+ latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)
+
+ image = self.vae.decode(latents).sample
+ video = image[None, :].reshape((batch_size, num_frames, -1) + image.shape[2:]).permute(0, 2, 1, 3, 4)
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+ video = video.float()
+ return video
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+ def prepare_extra_step_kwargs(self, generator, eta):
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+ # and should be between [0, 1]
+
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+ extra_step_kwargs = {}
+ if accepts_eta:
+ extra_step_kwargs["eta"] = eta
+
+ # check if the scheduler accepts generator
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+ if accepts_generator:
+ extra_step_kwargs["generator"] = generator
+ return extra_step_kwargs
+
+ def check_inputs(
+ self,
+ prompt,
+ prompt_2,
+ height,
+ width,
+ negative_prompt=None,
+ negative_prompt_2=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ pooled_prompt_embeds=None,
+ negative_pooled_prompt_embeds=None,
+ callback_on_step_end_tensor_inputs=None,
+ ):
+ if height % 8 != 0 or width % 8 != 0:
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+ if callback_on_step_end_tensor_inputs is not None and not all(
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+ ):
+ raise ValueError(
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+ )
+
+ if prompt is not None and prompt_embeds is not None:
+ raise ValueError(
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+ " only forward one of the two."
+ )
+ elif prompt_2 is not None and prompt_embeds is not None:
+ raise ValueError(
+ f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+ " only forward one of the two."
+ )
+ elif prompt is None and prompt_embeds is None:
+ raise ValueError(
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+ )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+ elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+ raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+
+ if negative_prompt is not None and negative_prompt_embeds is not None:
+ raise ValueError(
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+ )
+ elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+ raise ValueError(
+ f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+ )
+
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
+ raise ValueError(
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+ f" {negative_prompt_embeds.shape}."
+ )
+
+ if prompt_embeds is not None and pooled_prompt_embeds is None:
+ raise ValueError(
+ "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+ )
+
+ if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+ raise ValueError(
+ "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+ )
+
+ # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.prepare_latents
+ def prepare_latents(
+ self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None
+ ):
+ shape = (
+ batch_size,
+ num_channels_latents,
+ num_frames,
+ height // self.vae_scale_factor,
+ width // self.vae_scale_factor,
+ )
+ if isinstance(generator, list) and len(generator) != batch_size:
+ raise ValueError(
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+ )
+
+ if latents is None:
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+ else:
+ latents = latents.to(device)
+
+ # scale the initial noise by the standard deviation required by the scheduler
+ latents = latents * self.scheduler.init_noise_sigma
+ return latents
+
+ def _get_add_time_ids(
+ self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None
+ ):
+ add_time_ids = list(original_size + crops_coords_top_left + target_size)
+
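+ # e.g. for SDXL-base: 6 time ids * 256 (addition_time_embed_dim) + 1280 (text_encoder_2 projection dim) = 2816,
+ # which must equal add_embedding.linear_1.in_features (projection_class_embeddings_input_dim).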
+ passed_add_embed_dim = (
+ self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
+ )
+ expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
+
+ if expected_add_embed_dim != passed_add_embed_dim:
+ raise ValueError(
+ f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
+ )
+
+ add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
+ return add_time_ids
+
+ def upcast_vae(self):
+ dtype = self.vae.dtype
+ self.vae.to(dtype=torch.float32)
+ use_torch_2_0_or_xformers = isinstance(
+ self.vae.decoder.mid_block.attentions[0].processor,
+ (
+ AttnProcessor2_0,
+ XFormersAttnProcessor,
+ LoRAXFormersAttnProcessor,
+ LoRAAttnProcessor2_0,
+ FusedAttnProcessor2_0,
+ ),
+ )
+ # if xformers or torch_2_0 is used attention block does not need
+ # to be in float32 which can save lots of memory
+ if use_torch_2_0_or_xformers:
+ self.vae.post_quant_conv.to(dtype)
+ self.vae.decoder.conv_in.to(dtype)
+ self.vae.decoder.mid_block.to(dtype)
+
+ # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+ def get_guidance_scale_embedding(
+ self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+ ) -> torch.FloatTensor:
+ """
+ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+ Args:
+ w (`torch.Tensor`):
+ Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
+ embedding_dim (`int`, *optional*, defaults to 512):
+ Dimension of the embeddings to generate.
+ dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+ Data type of the generated embeddings.
+
+ Returns:
+ `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
+ """
+ assert len(w.shape) == 1
+ w = w * 1000.0
+
+ half_dim = embedding_dim // 2
+ emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+ emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+ emb = w.to(dtype)[:, None] * emb[None, :]
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+ if embedding_dim % 2 == 1: # zero pad
+ emb = torch.nn.functional.pad(emb, (0, 1))
+ assert emb.shape == (w.shape[0], embedding_dim)
+ return emb
+
+ @property
+ def guidance_scale(self):
+ return self._guidance_scale
+
+ @property
+ def guidance_rescale(self):
+ return self._guidance_rescale
+
+ @property
+ def clip_skip(self):
+ return self._clip_skip
+
+ # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+ # corresponds to doing no classifier free guidance.
+ @property
+ def do_classifier_free_guidance(self):
+ return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+ @property
+ def cross_attention_kwargs(self):
+ return self._cross_attention_kwargs
+
+ @property
+ def denoising_end(self):
+ return self._denoising_end
+
+ @property
+ def num_timesteps(self):
+ return self._num_timesteps
+
+ @property
+ def interrupt(self):
+ return self._interrupt
+
+ @torch.no_grad()
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
+ def __call__(
+ self,
+ prompt: Union[str, List[str]] = None,
+ prompt_2: Optional[Union[str, List[str]]] = None,
+ num_frames: int = 16,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 50,
+ timesteps: List[int] = None,
+ denoising_end: Optional[float] = None,
+ guidance_scale: float = 5.0,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ negative_prompt_2: Optional[Union[str, List[str]]] = None,
+ num_videos_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+ latents: Optional[torch.FloatTensor] = None,
+ prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+ ip_adapter_image: Optional[PipelineImageInput] = None,
+ ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ guidance_rescale: float = 0.0,
+ original_size: Optional[Tuple[int, int]] = None,
+ crops_coords_top_left: Tuple[int, int] = (0, 0),
+ target_size: Optional[Tuple[int, int]] = None,
+ negative_original_size: Optional[Tuple[int, int]] = None,
+ negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
+ negative_target_size: Optional[Tuple[int, int]] = None,
+ clip_skip: Optional[int] = None,
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+ ):
+ r"""
+ Function invoked when calling the pipeline for generation.
+
+ Args:
+ prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts to guide the video generation. If not defined, one has to pass `prompt_embeds`
+ instead.
+ prompt_2 (`str` or `List[str]`, *optional*):
+ The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+ used in both text-encoders
+ num_frames (`int`, *optional*, defaults to 16):
+ The number of video frames that are generated. Defaults to 16 frames, which at 8 frames per second
+ amounts to 2 seconds of video.
+ height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+ The height in pixels of the generated video. This is set to 1024 by default for the best results.
+ Anything below 512 pixels won't work well for
+ [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+ and checkpoints that are not specifically fine-tuned on low resolutions.
+ width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+ The width in pixels of the generated video. This is set to 1024 by default for the best results.
+ Anything below 512 pixels won't work well for
+ [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+ and checkpoints that are not specifically fine-tuned on low resolutions.
+ num_inference_steps (`int`, *optional*, defaults to 50):
+ The number of denoising steps. More denoising steps usually lead to a higher quality video at the
+ expense of slower inference.
+ timesteps (`List[int]`, *optional*):
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+ passed will be used. Must be in descending order.
+ denoising_end (`float`, *optional*):
+ When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
+ completed before it is intentionally terminated early. As a result, the returned sample will still
+ retain a substantial amount of noise, as determined by the discrete timesteps selected by the
+ scheduler. The `denoising_end` parameter should ideally be used when this pipeline forms part of a
+ "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
+ Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
+ guidance_scale (`float`, *optional*, defaults to 5.0):
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+ `guidance_scale` is defined as `w` of equation 2 of the [Imagen
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+ 1`. A higher guidance scale encourages the model to generate videos that are closely linked to the text
+ `prompt`, usually at the expense of lower video quality.
+ negative_prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the video generation. If not defined, one has to pass
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+ less than `1`).
+ negative_prompt_2 (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the video generation to be sent to `tokenizer_2` and
+ `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders.
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
+ The number of videos to generate per prompt.
+ eta (`float`, *optional*, defaults to 0.0):
+ Corresponds to parameter eta (η) from the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+ [`schedulers.DDIMScheduler`] and is ignored for other schedulers.
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+ to make generation deterministic.
+ latents (`torch.FloatTensor`, *optional*):
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for video
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+ tensor will be generated by sampling using the supplied random `generator`.
+ prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+ provided, text embeddings will be generated from `prompt` input argument.
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+ argument.
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
+ negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+ input argument.
+ ip_adapter_image (`PipelineImageInput`, *optional*):
+ Optional image input to work with IP Adapters.
+ ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
+ Pre-generated image embeddings for IP-Adapter. If not provided, embeddings are computed from the
+ `ip_adapter_image` input argument.
+ output_type (`str`, *optional*, defaults to `"pil"`):
+ The output format of the generated video. Choose between
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] instead of a
+ plain tuple.
+ cross_attention_kwargs (`dict`, *optional*):
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+ `self.processor` in
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+ guidance_rescale (`float`, *optional*, defaults to 0.0):
+ Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
+ Flawed](https://arxiv.org/pdf/2305.08891.pdf). `guidance_rescale` is defined as `φ` in equation 16 of
+ that paper. The guidance rescale factor should fix overexposure when using zero terminal SNR.
+ original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+ If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+ `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
+ explained in section 2.2 of
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+ crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+ `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+ `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+ `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+ target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+ For most cases, `target_size` should be set to the desired height and width of the generated image. If
+ not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
+ section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+ negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+ To negatively condition the generation process based on a specific image resolution. Part of SDXL's
+ micro-conditioning as explained in section 2.2 of
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+ information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+ negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+ To negatively condition the generation process based on specific crop coordinates. Part of SDXL's
+ micro-conditioning as explained in section 2.2 of
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+ information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+ negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+ To negatively condition the generation process based on a target image resolution. It should be the
+ same as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+ information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+ callback_on_step_end (`Callable`, *optional*):
+ A function that is called at the end of each denoising step during inference. The function is called
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+ `callback_on_step_end_tensor_inputs`.
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+ will be passed as the `callback_kwargs` argument. You will only be able to include variables listed in the
+ `._callback_tensor_inputs` attribute of your pipeline class.
+
+ Examples:
+
+ Returns:
+ [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] or `tuple`:
+ If `return_dict` is `True`, [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] is
+ returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
+ """
+
+ # 0. Default height and width to unet
+ height = height or self.default_sample_size * self.vae_scale_factor
+ width = width or self.default_sample_size * self.vae_scale_factor
+
+ num_videos_per_prompt = 1
+
+ original_size = original_size or (height, width)
+ target_size = target_size or (height, width)
+
+ # 1. Check inputs. Raise error if not correct
+ self.check_inputs(
+ prompt,
+ prompt_2,
+ height,
+ width,
+ negative_prompt,
+ negative_prompt_2,
+ prompt_embeds,
+ negative_prompt_embeds,
+ pooled_prompt_embeds,
+ negative_pooled_prompt_embeds,
+ callback_on_step_end_tensor_inputs,
+ )
+
+ self._guidance_scale = guidance_scale
+ self._guidance_rescale = guidance_rescale
+ self._clip_skip = clip_skip
+ self._cross_attention_kwargs = cross_attention_kwargs
+ self._denoising_end = denoising_end
+ self._interrupt = False
+
+ # 2. Define call parameters
+ if prompt is not None and isinstance(prompt, str):
+ batch_size = 1
+ elif prompt is not None and isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ batch_size = prompt_embeds.shape[0]
+
+ device = self._execution_device
+
+ # 3. Encode input prompt
+ lora_scale = (
+ self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+ )
+
+ (
+ prompt_embeds,
+ negative_prompt_embeds,
+ pooled_prompt_embeds,
+ negative_pooled_prompt_embeds,
+ ) = self.encode_prompt(
+ prompt=prompt,
+ prompt_2=prompt_2,
+ device=device,
+ num_videos_per_prompt=num_videos_per_prompt,
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
+ negative_prompt=negative_prompt,
+ negative_prompt_2=negative_prompt_2,
+ prompt_embeds=prompt_embeds,
+ negative_prompt_embeds=negative_prompt_embeds,
+ pooled_prompt_embeds=pooled_prompt_embeds,
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+ lora_scale=lora_scale,
+ clip_skip=self.clip_skip,
+ )
+
+ # 4. Prepare timesteps
+ timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+
+ # 5. Prepare latent variables
+ num_channels_latents = self.unet.config.in_channels
+ latents = self.prepare_latents(
+ batch_size * num_videos_per_prompt,
+ num_channels_latents,
+ num_frames,
+ height,
+ width,
+ prompt_embeds.dtype,
+ device,
+ generator,
+ latents,
+ )
+
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+ # 7. Prepare added time ids & embeddings
+ add_text_embeds = pooled_prompt_embeds
+ if self.text_encoder_2 is None:
+ text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+ else:
+ text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+
+ add_time_ids = self._get_add_time_ids(
+ original_size,
+ crops_coords_top_left,
+ target_size,
+ dtype=prompt_embeds.dtype,
+ text_encoder_projection_dim=text_encoder_projection_dim,
+ )
+ if negative_original_size is not None and negative_target_size is not None:
+ negative_add_time_ids = self._get_add_time_ids(
+ negative_original_size,
+ negative_crops_coords_top_left,
+ negative_target_size,
+ dtype=prompt_embeds.dtype,
+ text_encoder_projection_dim=text_encoder_projection_dim,
+ )
+ else:
+ negative_add_time_ids = add_time_ids
+
+ if self.do_classifier_free_guidance:
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+ add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+ add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
+
+ prompt_embeds = prompt_embeds.to(device)
+ add_text_embeds = add_text_embeds.to(device)
+ add_time_ids = add_time_ids.to(device).repeat(batch_size * num_videos_per_prompt, 1)
+
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+ image_embeds = self.prepare_ip_adapter_image_embeds(
+ ip_adapter_image,
+ ip_adapter_image_embeds,
+ device,
+ batch_size * num_videos_per_prompt,
+ self.do_classifier_free_guidance,
+ )
+
+ # 7.1 Apply denoising_end
+ if (
+ self.denoising_end is not None
+ and isinstance(self.denoising_end, float)
+ and self.denoising_end > 0
+ and self.denoising_end < 1
+ ):
+ discrete_timestep_cutoff = int(
+ round(
+ self.scheduler.config.num_train_timesteps
+ - (self.denoising_end * self.scheduler.config.num_train_timesteps)
+ )
+ )
+ num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
+ timesteps = timesteps[:num_inference_steps]
+
+ # 8. Optionally get Guidance Scale Embedding
+ timestep_cond = None
+ if self.unet.config.time_cond_proj_dim is not None:
+ guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_videos_per_prompt)
+ timestep_cond = self.get_guidance_scale_embedding(
+ guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+ ).to(device=device, dtype=latents.dtype)
+
+ num_free_init_iters = self._free_init_num_iters if self.free_init_enabled else 1
+ for free_init_iter in range(num_free_init_iters):
+ if self.free_init_enabled:
+ latents, timesteps = self._apply_free_init(
+ latents, free_init_iter, num_inference_steps, device, latents.dtype, generator
+ )
+
+ self._num_timesteps = len(timesteps)
+
+ # 9. Denoising loop
+ with self.progress_bar(total=self._num_timesteps) as progress_bar:
+ for i, t in enumerate(timesteps):
+ if self.interrupt:
+ continue
+
+ # expand the latents if we are doing classifier free guidance
+ latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+ # predict the noise residual
+ added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+ added_cond_kwargs["image_embeds"] = image_embeds
+
+ noise_pred = self.unet(
+ latent_model_input,
+ t,
+ encoder_hidden_states=prompt_embeds,
+ timestep_cond=timestep_cond,
+ cross_attention_kwargs=self.cross_attention_kwargs,
+ added_cond_kwargs=added_cond_kwargs,
+ return_dict=False,
+ )[0]
+
+ # perform guidance
+ if self.do_classifier_free_guidance:
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+ if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
+ # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+ noise_pred = rescale_noise_cfg(
+ noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale
+ )
+
+ # compute the previous noisy sample x_t -> x_t-1
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+ if callback_on_step_end is not None:
+ callback_kwargs = {}
+ for k in callback_on_step_end_tensor_inputs:
+ callback_kwargs[k] = locals()[k]
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+ latents = callback_outputs.pop("latents", latents)
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+ add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
+ negative_pooled_prompt_embeds = callback_outputs.pop(
+ "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
+ )
+ add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
+ negative_add_time_ids = callback_outputs.pop("negative_add_time_ids", negative_add_time_ids)
+
+ progress_bar.update()
+
+ # make sure the VAE is in float32 mode, as it overflows in float16
+ needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+
+ if needs_upcasting:
+ self.upcast_vae()
+ latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+
+ # 10. Post processing
+ if output_type == "latent":
+ video = latents
+ else:
+ video_tensor = self.decode_latents(latents)
+ video = tensor2vid(video_tensor, self.image_processor, output_type=output_type)
+
+ # cast back to fp16 if needed
+ if needs_upcasting:
+ self.vae.to(dtype=torch.float16)
+
+ # 11. Offload all models
+ self.maybe_free_model_hooks()
+
+ if not return_dict:
+ return (video,)
+
+ return AnimateDiffPipelineOutput(frames=video)
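# Illustrative usage sketch (not part of this diff) for the `__call__` documented above.
# The class name `AnimateDiffSDXLPipeline`, the checkpoint ids and the `export_to_gif`
# helper are assumptions made for the example; adjust them to whatever this PR exposes.
import torch
from diffusers import AnimateDiffSDXLPipeline, MotionAdapter
from diffusers.utils import export_to_gif

adapter = MotionAdapter.from_pretrained(
    "guoyww/animatediff-motion-adapter-sdxl-beta", torch_dtype=torch.float16
)
pipe = AnimateDiffSDXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", motion_adapter=adapter, torch_dtype=torch.float16
).to("cuda")

output = pipe(
    prompt="a panda surfing a wave, golden hour",
    num_frames=16,              # 16 frames ~ 2 seconds at 8 fps
    guidance_scale=5.0,         # > 1 enables classifier-free guidance
    num_inference_steps=25,
    output_type="pil",
)
export_to_gif(output.frames[0], "panda_surfing.gif")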
diff --git a/src/diffusers/pipelines/auto_pipeline.py b/src/diffusers/pipelines/auto_pipeline.py
index 79ee3fdad461..5fb497ef2e22 100644
--- a/src/diffusers/pipelines/auto_pipeline.py
+++ b/src/diffusers/pipelines/auto_pipeline.py
@@ -45,7 +45,7 @@
)
from .kandinsky3 import Kandinsky3Img2ImgPipeline, Kandinsky3Pipeline
from .latent_consistency_models import LatentConsistencyModelImg2ImgPipeline, LatentConsistencyModelPipeline
-from .pixart_alpha import PixArtAlphaPipeline
+from .pixart_alpha import PixArtAlphaPipeline, PixArtSigmaPipeline
from .stable_cascade import StableCascadeCombinedPipeline, StableCascadeDecoderPipeline
from .stable_diffusion import (
StableDiffusionImg2ImgPipeline,
@@ -73,7 +73,8 @@
("wuerstchen", WuerstchenCombinedPipeline),
("cascade", StableCascadeCombinedPipeline),
("lcm", LatentConsistencyModelPipeline),
- ("pixart", PixArtAlphaPipeline),
+ ("pixart-alpha", PixArtAlphaPipeline),
+ ("pixart-sigma", PixArtSigmaPipeline),
]
)
@@ -216,7 +217,7 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs):
```
Parameters:
- pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
+ pretrained_model_or_path (`str` or `os.PathLike`, *optional*):
Can be either:
- A string, the *repo id* (for example `CompVis/ldm-text2im-large-256`) of a pretrained pipeline
@@ -233,9 +234,9 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs):
cache_dir (`Union[str, os.PathLike]`, *optional*):
Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
is not used.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
- incompletely downloaded files are deleted.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -310,7 +311,7 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs):
"""
cache_dir = kwargs.pop("cache_dir", None)
force_download = kwargs.pop("force_download", False)
- resume_download = kwargs.pop("resume_download", False)
+ resume_download = kwargs.pop("resume_download", None)
proxies = kwargs.pop("proxies", None)
token = kwargs.pop("token", None)
local_files_only = kwargs.pop("local_files_only", False)
@@ -489,7 +490,7 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs):
```
Parameters:
- pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
+ pretrained_model_or_path (`str` or `os.PathLike`, *optional*):
Can be either:
- A string, the *repo id* (for example `CompVis/ldm-text2im-large-256`) of a pretrained pipeline
@@ -506,9 +507,9 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs):
cache_dir (`Union[str, os.PathLike]`, *optional*):
Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
is not used.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
- incompletely downloaded files are deleted.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -583,7 +584,7 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs):
"""
cache_dir = kwargs.pop("cache_dir", None)
force_download = kwargs.pop("force_download", False)
- resume_download = kwargs.pop("resume_download", False)
+ resume_download = kwargs.pop("resume_download", None)
proxies = kwargs.pop("proxies", None)
token = kwargs.pop("token", None)
local_files_only = kwargs.pop("local_files_only", False)
@@ -765,7 +766,7 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs):
```
Parameters:
- pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
+ pretrained_model_or_path (`str` or `os.PathLike`, *optional*):
Can be either:
- A string, the *repo id* (for example `CompVis/ldm-text2im-large-256`) of a pretrained pipeline
@@ -782,9 +783,9 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs):
cache_dir (`Union[str, os.PathLike]`, *optional*):
Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
is not used.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
- incompletely downloaded files are deleted.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -859,7 +860,7 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs):
"""
cache_dir = kwargs.pop("cache_dir", None)
force_download = kwargs.pop("force_download", False)
- resume_download = kwargs.pop("resume_download", False)
+ resume_download = kwargs.pop("resume_download", None)
proxies = kwargs.pop("proxies", None)
token = kwargs.pop("token", None)
local_files_only = kwargs.pop("local_files_only", False)
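# A minimal sketch of what the new "pixart-sigma" mapping enables: AutoPipelineForText2Image
# should now resolve PixArt-Sigma checkpoints to `PixArtSigmaPipeline` rather than falling
# back to the alpha pipeline. The checkpoint id below is an assumption for illustration.
import torch
from diffusers import AutoPipelineForText2Image

pipe = AutoPipelineForText2Image.from_pretrained(
    "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS", torch_dtype=torch.float16
).to("cuda")
print(type(pipe).__name__)  # expected: PixArtSigmaPipeline
image = pipe("a red fox curled up in the snow").images[0]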
diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py
index a5a0aaed0f2e..022f30d819d8 100644
--- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py
+++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py
@@ -1169,15 +1169,16 @@ def __call__(
self._num_timesteps = len(timesteps)
# 6. Prepare latent variables
- latents = self.prepare_latents(
- image,
- latent_timestep,
- batch_size,
- num_images_per_prompt,
- prompt_embeds.dtype,
- device,
- generator,
- )
+ if latents is None:
+ latents = self.prepare_latents(
+ image,
+ latent_timestep,
+ batch_size,
+ num_images_per_prompt,
+ prompt_embeds.dtype,
+ device,
+ generator,
+ )
# 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
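# Toy sketch of what the `if latents is None` guard above permits: a caller can now hand
# the ControlNet img2img pipeline a pre-built latent tensor, and `prepare_latents()` is
# skipped entirely. Checkpoints, image sizes and the latent shape are illustrative
# assumptions only.
import torch
from PIL import Image
from diffusers import ControlNetModel, StableDiffusionControlNetImg2ImgPipeline

controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
pipe = StableDiffusionControlNetImg2ImgPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
).to("cuda")

init_image = Image.new("RGB", (512, 512), "gray")       # placeholder inputs
control_image = Image.new("RGB", (512, 512), "black")
latents = torch.randn(1, 4, 64, 64, dtype=torch.float16, device="cuda")

result = pipe(
    prompt="a watercolor landscape",
    image=init_image,
    control_image=control_image,
    latents=latents,            # used as-is; the init image is no longer re-encoded into latents
)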
diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py
index 18c4370b8025..b9c4e3c0032c 100644
--- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py
+++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py
@@ -151,7 +151,12 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
class StableDiffusionXLControlNetInpaintPipeline(
- DiffusionPipeline, StableDiffusionMixin, StableDiffusionXLLoraLoaderMixin, FromSingleFileMixin, IPAdapterMixin
+ DiffusionPipeline,
+ StableDiffusionMixin,
+ StableDiffusionXLLoraLoaderMixin,
+ FromSingleFileMixin,
+ IPAdapterMixin,
+ TextualInversionLoaderMixin,
):
r"""
Pipeline for text-to-image generation using Stable Diffusion XL.
@@ -160,6 +165,7 @@ class StableDiffusionXLControlNetInpaintPipeline(
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
The pipeline also inherits the following loading methods:
+ - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
- [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
- [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
- [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py
index d32e7d81649d..dfd3cc239b36 100644
--- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py
+++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py
@@ -89,8 +89,8 @@
... variant="fp16",
... use_safetensors=True,
... torch_dtype=torch.float16,
- ... ).to("cuda")
- >>> vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16).to("cuda")
+ ... )
+ >>> vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
>>> pipe = StableDiffusionXLControlNetImg2ImgPipeline.from_pretrained(
... "stabilityai/stable-diffusion-xl-base-1.0",
... controlnet=controlnet,
@@ -98,7 +98,7 @@
... variant="fp16",
... use_safetensors=True,
... torch_dtype=torch.float16,
- ... ).to("cuda")
+ ... )
>>> pipe.enable_model_cpu_offload()
@@ -1429,16 +1429,17 @@ def __call__(
self._num_timesteps = len(timesteps)
# 6. Prepare latent variables
- latents = self.prepare_latents(
- image,
- latent_timestep,
- batch_size,
- num_images_per_prompt,
- prompt_embeds.dtype,
- device,
- generator,
- True,
- )
+ if latents is None:
+ latents = self.prepare_latents(
+ image,
+ latent_timestep,
+ batch_size,
+ num_images_per_prompt,
+ prompt_embeds.dtype,
+ device,
+ generator,
+ True,
+ )
# 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
diff --git a/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py
index 3c3bd526692d..c84caa1fad88 100644
--- a/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py
+++ b/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py
@@ -363,6 +363,7 @@ class conditioning with `class_embed_type` equal to `None`.
"""
_supports_gradient_checkpointing = True
+ _no_split_modules = ["BasicTransformerBlock", "ResnetBlockFlat", "CrossAttnUpBlockFlat"]
@register_to_config
def __init__(
diff --git a/src/diffusers/pipelines/dit/pipeline_dit.py b/src/diffusers/pipelines/dit/pipeline_dit.py
index 289ea496028d..a3ea90874a12 100644
--- a/src/diffusers/pipelines/dit/pipeline_dit.py
+++ b/src/diffusers/pipelines/dit/pipeline_dit.py
@@ -227,6 +227,9 @@ def __call__(
if output_type == "pil":
samples = self.numpy_to_pil(samples)
+ # Offload all models
+ self.maybe_free_model_hooks()
+
if not return_dict:
return (samples,)
diff --git a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py
index 8957d7140ef1..fce694d1d0bd 100644
--- a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py
+++ b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py
@@ -872,9 +872,10 @@ def __call__(
else self.scheduler.config.original_inference_steps
)
latent_timestep = timesteps[:1]
- latents = self.prepare_latents(
- image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator
- )
+ if latents is None:
+ latents = self.prepare_latents(
+ image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator
+ )
bs = batch_size * num_images_per_prompt
# 6. Get Guidance Scale Embedding
diff --git a/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py b/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py
index cfab70926a4a..5ea7c2c14551 100644
--- a/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py
+++ b/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py
@@ -1419,7 +1419,6 @@ def encode_image(self, image, dtype=None, height=None, width=None, resize_mode="
if needs_upcasting:
image = image.float()
self.upcast_vae()
- image = image.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
x0 = self.vae.encode(image).latent_dist.mode()
x0 = x0.to(dtype)
diff --git a/src/diffusers/pipelines/pipeline_flax_utils.py b/src/diffusers/pipelines/pipeline_flax_utils.py
index b1035c1f2f42..7534149b559a 100644
--- a/src/diffusers/pipelines/pipeline_flax_utils.py
+++ b/src/diffusers/pipelines/pipeline_flax_utils.py
@@ -254,9 +254,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
- incompletely downloaded files are deleted.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -316,7 +316,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
```
"""
cache_dir = kwargs.pop("cache_dir", None)
- resume_download = kwargs.pop("resume_download", False)
+ resume_download = kwargs.pop("resume_download", None)
proxies = kwargs.pop("proxies", None)
local_files_only = kwargs.pop("local_files_only", False)
token = kwargs.pop("token", None)
diff --git a/src/diffusers/pipelines/pipeline_loading_utils.py b/src/diffusers/pipelines/pipeline_loading_utils.py
index 15fb34e72d24..f7d9785043d1 100644
--- a/src/diffusers/pipelines/pipeline_loading_utils.py
+++ b/src/diffusers/pipelines/pipeline_loading_utils.py
@@ -435,7 +435,7 @@ def _load_empty_model(
return_unused_kwargs=True,
return_commit_hash=True,
force_download=kwargs.pop("force_download", False),
- resume_download=kwargs.pop("resume_download", False),
+ resume_download=kwargs.pop("resume_download", None),
proxies=kwargs.pop("proxies", None),
local_files_only=kwargs.pop("local_files_only", False),
token=kwargs.pop("token", None),
@@ -454,7 +454,7 @@ def _load_empty_model(
cached_folder,
subfolder=name,
force_download=kwargs.pop("force_download", False),
- resume_download=kwargs.pop("resume_download", False),
+ resume_download=kwargs.pop("resume_download", None),
proxies=kwargs.pop("proxies", None),
local_files_only=kwargs.pop("local_files_only", False),
token=kwargs.pop("token", None),
diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py
index 68433332546b..e5f822caa0ef 100644
--- a/src/diffusers/pipelines/pipeline_utils.py
+++ b/src/diffusers/pipelines/pipeline_utils.py
@@ -376,7 +376,11 @@ def module_is_sequentially_offloaded(module):
if not is_accelerate_available() or is_accelerate_version("<", "0.14.0"):
return False
- return hasattr(module, "_hf_hook") and isinstance(module._hf_hook, accelerate.hooks.AlignDevicesHook)
+ return hasattr(module, "_hf_hook") and (
+ isinstance(module._hf_hook, accelerate.hooks.AlignDevicesHook)
+ or hasattr(module._hf_hook, "hooks")
+ and isinstance(module._hf_hook.hooks[0], accelerate.hooks.AlignDevicesHook)
+ )
def module_is_offloaded(module):
if not is_accelerate_available() or is_accelerate_version("<", "0.17.0.dev0"):
@@ -529,9 +533,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
cache_dir (`Union[str, os.PathLike]`, *optional*):
Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
is not used.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
- incompletely downloaded files are deleted.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -621,7 +625,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
```
"""
cache_dir = kwargs.pop("cache_dir", None)
- resume_download = kwargs.pop("resume_download", False)
+ resume_download = kwargs.pop("resume_download", None)
force_download = kwargs.pop("force_download", False)
proxies = kwargs.pop("proxies", None)
local_files_only = kwargs.pop("local_files_only", None)
@@ -1005,8 +1009,7 @@ def remove_all_hooks(self):
"""
for _, model in self.components.items():
if isinstance(model, torch.nn.Module) and hasattr(model, "_hf_hook"):
- is_sequential_cpu_offload = isinstance(getattr(model, "_hf_hook"), accelerate.hooks.AlignDevicesHook)
- accelerate.hooks.remove_hook_from_module(model, recurse=is_sequential_cpu_offload)
+ accelerate.hooks.remove_hook_from_module(model, recurse=True)
self._all_hooks = []
def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
@@ -1213,9 +1216,9 @@ def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]:
force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
- incompletely downloaded files are deleted.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
@@ -1268,7 +1271,7 @@ def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]:
"""
cache_dir = kwargs.pop("cache_dir", None)
- resume_download = kwargs.pop("resume_download", False)
+ resume_download = kwargs.pop("resume_download", None)
force_download = kwargs.pop("force_download", False)
proxies = kwargs.pop("proxies", None)
local_files_only = kwargs.pop("local_files_only", None)
diff --git a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py
index c9efe594e398..22f0c507e5ee 100644
--- a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py
+++ b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py
@@ -273,15 +273,6 @@ def __init__(
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.image_processor = PixArtImageProcessor(vae_scale_factor=self.vae_scale_factor)
- # Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/utils.py
- def mask_text_embeddings(self, emb, mask):
- if emb.shape[0] == 1:
- keep_index = mask.sum().item()
- return emb[:, :, :keep_index, :], keep_index
- else:
- masked_feature = emb * mask[:, None, :, None]
- return masked_feature, emb.shape[2]
-
# Adapted from diffusers.pipelines.deepfloyd_if.pipeline_if.encode_prompt
def encode_prompt(
self,
diff --git a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py
index dadf41c176ea..e3dd3ddc0404 100644
--- a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py
+++ b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py
@@ -199,16 +199,7 @@ def __init__(
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.image_processor = PixArtImageProcessor(vae_scale_factor=self.vae_scale_factor)
- # copied from diffusers.pipelines.pixart_alpha.pipeline_pixart_alpha.py
- def mask_text_embeddings(self, emb, mask):
- if emb.shape[0] == 1:
- keep_index = mask.sum().item()
- return emb[:, :, :keep_index, :], keep_index
- else:
- masked_feature = emb * mask[:, None, :, None]
- return masked_feature, emb.shape[2]
-
- # Adapted from diffusers.pipelines.deepfloyd_if.pipeline_if.encode_prompt
+ # Copied from diffusers.pipelines.pixart_alpha.pipeline_pixart_alpha.PixArtAlphaPipeline.encode_prompt
def encode_prompt(
self,
prompt: Union[str, List[str]],
@@ -369,7 +360,7 @@ def prepare_extra_step_kwargs(self, generator, eta):
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
- # copied from diffusers.pipelines.pixart_alpha.pipeline_pixart_alpha.py
+ # Copied from diffusers.pipelines.pixart_alpha.pipeline_pixart_alpha.PixArtAlphaPipeline.check_inputs
def check_inputs(
self,
prompt,
@@ -462,7 +453,7 @@ def process(text: str):
return [process(t) for t in text]
- # Copied from diffusers.pipelines.pixart_alpha.pipeline_pixart_alpha.PixArtAlphaPipeline._clean_caption
+ # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption
def _clean_caption(self, caption):
caption = str(caption)
caption = ul.unquote_plus(caption)
diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py
index 02e32633cedb..700ca5db6f07 100644
--- a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py
+++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py
@@ -239,15 +239,15 @@ def __call__(
num_embeddings = self.prior.config.num_embeddings
embedding_dim = self.prior.config.embedding_dim
-
- latents = self.prepare_latents(
- (batch_size, num_embeddings * embedding_dim),
- image_embeds.dtype,
- device,
- generator,
- latents,
- self.scheduler,
- )
+ if latents is None:
+ latents = self.prepare_latents(
+ (batch_size, num_embeddings * embedding_dim),
+ image_embeds.dtype,
+ device,
+ generator,
+ latents,
+ self.scheduler,
+ )
# YiYi notes: for testing only to match ldm, we can directly create a latents with desired shape: batch_size, num_embeddings, embedding_dim
latents = latents.reshape(latents.shape[0], num_embeddings, embedding_dim)
diff --git a/src/diffusers/pipelines/stable_diffusion/__init__.py b/src/diffusers/pipelines/stable_diffusion/__init__.py
index 0eda32d333b9..8ce08aec66ca 100644
--- a/src/diffusers/pipelines/stable_diffusion/__init__.py
+++ b/src/diffusers/pipelines/stable_diffusion/__init__.py
@@ -113,7 +113,6 @@
from .pipeline_stable_diffusion import (
StableDiffusionPipeline,
StableDiffusionPipelineOutput,
- StableDiffusionSafetyChecker,
)
from .pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipeline
from .pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipeline
diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py
index f04a21ef4857..b4b1b885dd3c 100644
--- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py
+++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py
@@ -557,7 +557,7 @@ def convert_ldm_unet_checkpoint(
paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
)
- output_block_list = {k: sorted(v) for k, v in output_block_list.items()}
+ output_block_list = {k: sorted(v) for k, v in sorted(output_block_list.items())}
if ["conv.bias", "conv.weight"] in output_block_list.values():
index = list(output_block_list.values()).index(["conv.bias", "conv.weight"])
new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py
index de2767e23952..0bf5a92a4fcc 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py
@@ -172,6 +172,7 @@ def __call__(
prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None,
ip_adapter_image: Optional[PipelineImageInput] = None,
+ ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
output_type: Optional[str] = "pil",
return_dict: bool = True,
callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
@@ -296,6 +297,8 @@ def __call__(
negative_prompt,
prompt_embeds,
negative_prompt_embeds,
+ ip_adapter_image,
+ ip_adapter_image_embeds,
callback_on_step_end_tensor_inputs,
)
self._guidance_scale = guidance_scale
@@ -303,14 +306,6 @@ def __call__(
device = self._execution_device
- if ip_adapter_image is not None:
- output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
- image_embeds, negative_image_embeds = self.encode_image(
- ip_adapter_image, device, num_images_per_prompt, output_hidden_state
- )
- if self.do_classifier_free_guidance:
- image_embeds = torch.cat([image_embeds, negative_image_embeds, negative_image_embeds])
-
if image is None:
raise ValueError("`image` input cannot be undefined.")
@@ -335,6 +330,14 @@ def __call__(
negative_prompt_embeds=negative_prompt_embeds,
)
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+ image_embeds = self.prepare_ip_adapter_image_embeds(
+ ip_adapter_image,
+ ip_adapter_image_embeds,
+ device,
+ batch_size * num_images_per_prompt,
+ self.do_classifier_free_guidance,
+ )
# 3. Preprocess image
image = self.image_processor.preprocess(image)
@@ -635,6 +638,65 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state
return image_embeds, uncond_image_embeds
+ def prepare_ip_adapter_image_embeds(
+ self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+ ):
+ if ip_adapter_image_embeds is None:
+ if not isinstance(ip_adapter_image, list):
+ ip_adapter_image = [ip_adapter_image]
+
+ if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+ raise ValueError(
+ f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+ )
+
+ image_embeds = []
+ for single_ip_adapter_image, image_proj_layer in zip(
+ ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+ ):
+ output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+ single_image_embeds, single_negative_image_embeds = self.encode_image(
+ single_ip_adapter_image, device, 1, output_hidden_state
+ )
+ single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
+ single_negative_image_embeds = torch.stack(
+ [single_negative_image_embeds] * num_images_per_prompt, dim=0
+ )
+
+ if do_classifier_free_guidance:
+ single_image_embeds = torch.cat(
+ [single_image_embeds, single_negative_image_embeds, single_negative_image_embeds]
+ )
+ single_image_embeds = single_image_embeds.to(device)
+
+ image_embeds.append(single_image_embeds)
+ else:
+ repeat_dims = [1]
+ image_embeds = []
+ for single_image_embeds in ip_adapter_image_embeds:
+ if do_classifier_free_guidance:
+ (
+ single_image_embeds,
+ single_negative_image_embeds,
+ single_negative_image_embeds,
+ ) = single_image_embeds.chunk(3)
+ single_image_embeds = single_image_embeds.repeat(
+ num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
+ )
+ single_negative_image_embeds = single_negative_image_embeds.repeat(
+ num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
+ )
+ single_image_embeds = torch.cat(
+ [single_image_embeds, single_negative_image_embeds, single_negative_image_embeds]
+ )
+ else:
+ single_image_embeds = single_image_embeds.repeat(
+ num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
+ )
+ image_embeds.append(single_image_embeds)
+
+ return image_embeds
+
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
def run_safety_checker(self, image, device, dtype):
if self.safety_checker is None:
@@ -687,6 +749,8 @@ def check_inputs(
negative_prompt=None,
prompt_embeds=None,
negative_prompt_embeds=None,
+ ip_adapter_image=None,
+ ip_adapter_image_embeds=None,
callback_on_step_end_tensor_inputs=None,
):
if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
@@ -728,6 +792,21 @@ def check_inputs(
f" {negative_prompt_embeds.shape}."
)
+ if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+ raise ValueError(
+ "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+ )
+
+ if ip_adapter_image_embeds is not None:
+ if not isinstance(ip_adapter_image_embeds, list):
+ raise ValueError(
+ f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+ )
+ elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+ raise ValueError(
+ f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+ )
+
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
shape = (
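# Hedged usage sketch for the `ip_adapter_image_embeds` path added above: embeddings can
# be computed once with the new `prepare_ip_adapter_image_embeds` helper and reused across
# calls. The checkpoints, the adapter weight name and the guidance flag are assumptions
# for illustration.
import torch
from PIL import Image
from diffusers import StableDiffusionInstructPix2PixPipeline

pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
    "timbrooks/instruct-pix2pix", torch_dtype=torch.float16
).to("cuda")
pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")

input_image = Image.new("RGB", (512, 512), "gray")   # placeholder inputs
style_image = Image.new("RGB", (512, 512), "blue")

# Precompute the embeddings (positive + negative chunks when CFG is on) ...
image_embeds = pipe.prepare_ip_adapter_image_embeds(
    [style_image], None, pipe.device, num_images_per_prompt=1, do_classifier_free_guidance=True
)
# ... and pass them instead of `ip_adapter_image` on subsequent calls.
edited = pipe(
    prompt="make it look like a watercolor painting",
    image=input_image,
    ip_adapter_image_embeds=image_embeds,
).images[0]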
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py
index fe19b4de3127..134ec39effc5 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py
@@ -786,16 +786,17 @@ def __call__(
# 6. Prepare latent variables
num_channels_latents = self.unet.config.in_channels
- latents = self.prepare_latents(
- batch_size=batch_size,
- num_channels_latents=num_channels_latents,
- height=height,
- width=width,
- dtype=prompt_embeds.dtype,
- device=device,
- generator=generator,
- latents=latents,
- )
+ if latents is None:
+ latents = self.prepare_latents(
+ batch_size=batch_size,
+ num_channels_latents=num_channels_latents,
+ height=height,
+ width=width,
+ dtype=prompt_embeds.dtype,
+ device=device,
+ generator=generator,
+ latents=latents,
+ )
# 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
diff --git a/src/diffusers/pipelines/stable_diffusion/safety_checker.py b/src/diffusers/pipelines/stable_diffusion/safety_checker.py
index 6cc4d26f29b4..3e6dec3e0bff 100644
--- a/src/diffusers/pipelines/stable_diffusion/safety_checker.py
+++ b/src/diffusers/pipelines/stable_diffusion/safety_checker.py
@@ -31,6 +31,7 @@ def cosine_distance(image_embeds, text_embeds):
class StableDiffusionSafetyChecker(PreTrainedModel):
config_class = CLIPConfig
+ main_input_name = "clip_input"
_no_split_modules = ["CLIPEncoderLayer"]
diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py
index b72b19d5c1ef..b98ea279c1a2 100644
--- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py
@@ -1247,17 +1247,19 @@ def denoising_value_valid(dnv):
latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
add_noise = True if self.denoising_start is None else False
+
# 6. Prepare latent variables
- latents = self.prepare_latents(
- image,
- latent_timestep,
- batch_size,
- num_images_per_prompt,
- prompt_embeds.dtype,
- device,
- generator,
- add_noise,
- )
+ if latents is None:
+ latents = self.prepare_latents(
+ image,
+ latent_timestep,
+ batch_size,
+ num_images_per_prompt,
+ prompt_embeds.dtype,
+ device,
+ generator,
+ add_noise,
+ )
# 7. Prepare extra step kwargs.
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py
index a2242bb099c5..d9380020b329 100644
--- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py
+++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py
@@ -436,7 +436,6 @@ def prepare_extra_step_kwargs(self, generator, eta):
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_instruct_pix2pix.StableDiffusionInstructPix2PixPipeline.check_inputs
def check_inputs(
self,
prompt,
@@ -526,8 +525,8 @@ def prepare_image_latents(
# make sure the VAE is in float32 mode, as it overflows in float16
needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
if needs_upcasting:
+ image = image.float()
self.upcast_vae()
- image = image.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
image_latents = retrieve_latents(self.vae.encode(image), sample_mode="argmax")
diff --git a/src/diffusers/pipelines/stable_diffusion_xl/watermark.py b/src/diffusers/pipelines/stable_diffusion_xl/watermark.py
index 5b6e36d9f447..f457cdbdb1eb 100644
--- a/src/diffusers/pipelines/stable_diffusion_xl/watermark.py
+++ b/src/diffusers/pipelines/stable_diffusion_xl/watermark.py
@@ -28,9 +28,15 @@ def apply_watermark(self, images: torch.FloatTensor):
images = (255 * (images / 2 + 0.5)).cpu().permute(0, 2, 3, 1).float().numpy()
- images = [self.encoder.encode(image, "dwtDct") for image in images]
+ # Convert RGB to BGR, which is the channel order expected by the watermark encoder.
+ images = images[:, :, :, ::-1]
- images = torch.from_numpy(np.array(images)).permute(0, 3, 1, 2)
+ # Add watermark and convert BGR back to RGB
+ images = [self.encoder.encode(image, "dwtDct")[:, :, ::-1] for image in images]
+
+ images = np.array(images)
+
+ images = torch.from_numpy(images).permute(0, 3, 1, 2)
images = torch.clamp(2 * (images / 255 - 0.5), min=-1.0, max=1.0)
return images
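# Standalone illustration of the channel flip above: reversing the last axis converts an
# RGB batch to BGR (the order the invisible-watermark encoder expects), and a second flip
# restores RGB. Purely a toy example with synthetic data.
import numpy as np

rgb = np.zeros((1, 8, 8, 3), dtype=np.uint8)
rgb[..., 0] = 255                       # pure red stored in RGB order
bgr = rgb[:, :, :, ::-1]                # channel order reversed: red now sits in the last slot
assert bgr[0, 0, 0, 2] == 255
assert np.array_equal(bgr[:, :, :, ::-1], rgb)   # flipping again round-trips back to RGB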
diff --git a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py
index 070183b92409..da6832cebd4d 100644
--- a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py
+++ b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py
@@ -199,6 +199,9 @@ def _encode_vae_image(
image = image.to(device=device)
image_latents = self.vae.encode(image).latent_dist.mode()
+ # duplicate image_latents for each generation per prompt, using mps friendly method
+ image_latents = image_latents.repeat(num_videos_per_prompt, 1, 1, 1)
+
if do_classifier_free_guidance:
negative_image_latents = torch.zeros_like(image_latents)
@@ -207,9 +210,6 @@ def _encode_vae_image(
# to avoid doing two forward passes
image_latents = torch.cat([negative_image_latents, image_latents])
- # duplicate image_latents for each generation per prompt, using mps friendly method
- image_latents = image_latents.repeat(num_videos_per_prompt, 1, 1, 1)
-
return image_latents
def _get_add_time_ids(
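# Toy check of why the repeat above now happens before the classifier-free-guidance
# concatenation: duplicating the per-prompt copies first keeps the negative (zero) half
# and the positive half of the CFG batch the same size and in matching order.
import torch

num_videos_per_prompt = 2
image_latents = torch.arange(4.0).reshape(1, 1, 2, 2)
image_latents = image_latents.repeat(num_videos_per_prompt, 1, 1, 1)    # (2, 1, 2, 2)
negative_image_latents = torch.zeros_like(image_latents)
cfg_batch = torch.cat([negative_image_latents, image_latents])          # (4, 1, 2, 2)
assert cfg_batch.shape[0] == 2 * num_videos_per_prompt
assert torch.equal(cfg_batch[:num_videos_per_prompt], negative_image_latents)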
diff --git a/src/diffusers/schedulers/deprecated/__init__.py b/src/diffusers/schedulers/deprecated/__init__.py
index 786707f45206..479cf9bd568b 100644
--- a/src/diffusers/schedulers/deprecated/__init__.py
+++ b/src/diffusers/schedulers/deprecated/__init__.py
@@ -30,7 +30,7 @@
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
- from ..utils.dummy_pt_objects import * # noqa F403
+ from ...utils.dummy_pt_objects import * # noqa F403
else:
from .scheduling_karras_ve import KarrasVeScheduler
from .scheduling_sde_vp import ScoreSdeVpScheduler
diff --git a/src/diffusers/schedulers/scheduling_ddpm_flax.py b/src/diffusers/schedulers/scheduling_ddpm_flax.py
index 6bdfa5eb5e0d..d06a171159ee 100644
--- a/src/diffusers/schedulers/scheduling_ddpm_flax.py
+++ b/src/diffusers/schedulers/scheduling_ddpm_flax.py
@@ -222,9 +222,13 @@ def step(
t = timestep
if key is None:
- key = jax.random.PRNGKey(0)
+ key = jax.random.key(0)
- if model_output.shape[1] == sample.shape[1] * 2 and self.config.variance_type in ["learned", "learned_range"]:
+ if (
+ len(model_output.shape) > 1
+ and model_output.shape[1] == sample.shape[1] * 2
+ and self.config.variance_type in ["learned", "learned_range"]
+ ):
model_output, predicted_variance = jnp.split(model_output, sample.shape[1], axis=1)
else:
predicted_variance = None
@@ -264,7 +268,7 @@ def step(
# 6. Add noise
def random_variance():
- split_key = jax.random.split(key, num=1)
+ split_key = jax.random.split(key, num=1)[0]
noise = jax.random.normal(split_key, shape=model_output.shape, dtype=self.dtype)
return (self._get_variance(state, t, predicted_variance=predicted_variance) ** 0.5) * noise
diff --git a/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py
index 26a41d7335c5..dfc7978a2ee2 100644
--- a/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py
+++ b/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py
@@ -14,6 +14,7 @@
# DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver and https://github.com/NVlabs/edm
+import math
from typing import List, Optional, Tuple, Union
import numpy as np
@@ -44,6 +45,10 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
range is [0.2, 80.0].
sigma_data (`float`, *optional*, defaults to 0.5):
The standard deviation of the data distribution. This is set to 0.5 in the EDM paper [1].
+ sigma_schedule (`str`, *optional*, defaults to `karras`):
+ Sigma schedule to compute the `sigmas`. By default, we use the schedule introduced in the EDM paper
+ (https://arxiv.org/abs/2206.00364). The other accepted value is "exponential". The exponential schedule was
+ incorporated in this model: https://huggingface.co/stabilityai/cosxl.
num_train_timesteps (`int`, defaults to 1000):
The number of diffusion steps to train the model.
solver_order (`int`, defaults to 2):
@@ -89,6 +94,7 @@ def __init__(
sigma_min: float = 0.002,
sigma_max: float = 80.0,
sigma_data: float = 0.5,
+ sigma_schedule: str = "karras",
num_train_timesteps: int = 1000,
prediction_type: str = "epsilon",
rho: float = 7.0,
@@ -121,7 +127,11 @@ def __init__(
)
ramp = torch.linspace(0, 1, num_train_timesteps)
- sigmas = self._compute_sigmas(ramp)
+ if sigma_schedule == "karras":
+ sigmas = self._compute_karras_sigmas(ramp)
+ elif sigma_schedule == "exponential":
+ sigmas = self._compute_exponential_sigmas(ramp)
+
self.timesteps = self.precondition_noise(sigmas)
self.sigmas = self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)])
@@ -236,7 +246,10 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc
self.num_inference_steps = num_inference_steps
ramp = np.linspace(0, 1, self.num_inference_steps)
- sigmas = self._compute_sigmas(ramp)
+ if self.config.sigma_schedule == "karras":
+ sigmas = self._compute_karras_sigmas(ramp)
+ elif self.config.sigma_schedule == "exponential":
+ sigmas = self._compute_exponential_sigmas(ramp)
sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device)
self.timesteps = self.precondition_noise(sigmas)
@@ -262,10 +275,9 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc
self._begin_index = None
self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication
- # Taken from https://github.com/crowsonkb/k-diffusion/blob/686dbad0f39640ea25c8a8c6a6e56bb40eacefa2/k_diffusion/sampling.py#L17
- def _compute_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.FloatTensor:
+ # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler._compute_karras_sigmas
+ def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.FloatTensor:
"""Constructs the noise schedule of Karras et al. (2022)."""
-
sigma_min = sigma_min or self.config.sigma_min
sigma_max = sigma_max or self.config.sigma_max
@@ -273,6 +285,18 @@ def _compute_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.FloatTe
min_inv_rho = sigma_min ** (1 / rho)
max_inv_rho = sigma_max ** (1 / rho)
sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
+
+ return sigmas
+
+ # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler._compute_exponential_sigmas
+ def _compute_exponential_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.FloatTensor:
+ """Implementation closely follows k-diffusion.
+
+ https://github.com/crowsonkb/k-diffusion/blob/6ab5146d4a5ef63901326489f31f1d8e7dd36b48/k_diffusion/sampling.py#L26
+ """
+ sigma_min = sigma_min or self.config.sigma_min
+ sigma_max = sigma_max or self.config.sigma_max
+ sigmas = torch.linspace(math.log(sigma_min), math.log(sigma_max), len(ramp)).exp().flip(0)
return sigmas
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
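The two `_compute_*_sigmas` helpers differ only in how they map the unit `ramp` onto noise levels: the Karras schedule interpolates in `sigma ** (1 / rho)` space, while the exponential schedule is evenly spaced in `log(sigma)`. A self-contained sketch that mirrors the code above (defaults taken from the scheduler config; it does not import diffusers):

    import math

    import torch

    def karras_sigmas(ramp, sigma_min=0.002, sigma_max=80.0, rho=7.0):
        # interpolate between sigma_max and sigma_min in sigma ** (1 / rho) space
        min_inv_rho = sigma_min ** (1 / rho)
        max_inv_rho = sigma_max ** (1 / rho)
        return (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho

    def exponential_sigmas(ramp, sigma_min=0.002, sigma_max=80.0):
        # evenly spaced in log(sigma), flipped so sigmas decrease along the ramp
        return torch.linspace(math.log(sigma_min), math.log(sigma_max), len(ramp)).exp().flip(0)

    ramp = torch.linspace(0, 1, 5)
    print(karras_sigmas(ramp))       # starts at sigma_max (80.0) and decays to sigma_min (0.002)
    print(exponential_sigmas(ramp))  # same endpoints, but evenly spaced in log-sigma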
diff --git a/src/diffusers/schedulers/scheduling_edm_euler.py b/src/diffusers/schedulers/scheduling_edm_euler.py
index f6a09ca1ee16..0ef9263c9e30 100644
--- a/src/diffusers/schedulers/scheduling_edm_euler.py
+++ b/src/diffusers/schedulers/scheduling_edm_euler.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union
@@ -65,6 +66,10 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin):
range is [0.2, 80.0].
sigma_data (`float`, *optional*, defaults to 0.5):
The standard deviation of the data distribution. This is set to 0.5 in the EDM paper [1].
+ sigma_schedule (`str`, *optional*, defaults to `karras`):
+ Sigma schedule to compute the `sigmas`. By default, we use the schedule introduced in the EDM paper
+ (https://arxiv.org/abs/2206.00364). The other accepted value is "exponential"; the exponential schedule
+ was incorporated in this model: https://huggingface.co/stabilityai/cosxl.
num_train_timesteps (`int`, defaults to 1000):
The number of diffusion steps to train the model.
prediction_type (`str`, defaults to `epsilon`, *optional*):
@@ -84,15 +89,23 @@ def __init__(
sigma_min: float = 0.002,
sigma_max: float = 80.0,
sigma_data: float = 0.5,
+ sigma_schedule: str = "karras",
num_train_timesteps: int = 1000,
prediction_type: str = "epsilon",
rho: float = 7.0,
):
+ if sigma_schedule not in ["karras", "exponential"]:
+ raise ValueError(f"Wrong value for provided for `{sigma_schedule=}`.`")
+
# setable values
self.num_inference_steps = None
ramp = torch.linspace(0, 1, num_train_timesteps)
- sigmas = self._compute_sigmas(ramp)
+ if sigma_schedule == "karras":
+ sigmas = self._compute_karras_sigmas(ramp)
+ elif sigma_schedule == "exponential":
+ sigmas = self._compute_exponential_sigmas(ramp)
+
self.timesteps = self.precondition_noise(sigmas)
self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)])
@@ -200,7 +213,10 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic
self.num_inference_steps = num_inference_steps
ramp = np.linspace(0, 1, self.num_inference_steps)
- sigmas = self._compute_sigmas(ramp)
+ if self.config.sigma_schedule == "karras":
+ sigmas = self._compute_karras_sigmas(ramp)
+ elif self.config.sigma_schedule == "exponential":
+ sigmas = self._compute_exponential_sigmas(ramp)
sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device)
self.timesteps = self.precondition_noise(sigmas)
@@ -211,9 +227,8 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic
self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication
# Taken from https://github.com/crowsonkb/k-diffusion/blob/686dbad0f39640ea25c8a8c6a6e56bb40eacefa2/k_diffusion/sampling.py#L17
- def _compute_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.FloatTensor:
+ def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.FloatTensor:
"""Constructs the noise schedule of Karras et al. (2022)."""
-
sigma_min = sigma_min or self.config.sigma_min
sigma_max = sigma_max or self.config.sigma_max
@@ -221,6 +236,17 @@ def _compute_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.FloatTe
min_inv_rho = sigma_min ** (1 / rho)
max_inv_rho = sigma_max ** (1 / rho)
sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
+
+ return sigmas
+
+ def _compute_exponential_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.FloatTensor:
+ """Implementation closely follows k-diffusion.
+
+ https://github.com/crowsonkb/k-diffusion/blob/6ab5146d4a5ef63901326489f31f1d8e7dd36b48/k_diffusion/sampling.py#L26
+ """
+ sigma_min = sigma_min or self.config.sigma_min
+ sigma_max = sigma_max or self.config.sigma_max
+ sigmas = torch.linspace(math.log(sigma_min), math.log(sigma_max), len(ramp)).exp().flip(0)
return sigmas
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
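From the user side, the schedule should become selectable through the scheduler config once this lands. A hedged usage sketch, assuming `EDMEulerScheduler` is exported at the top level of diffusers and accepts the `sigma_schedule` argument introduced here:

    from diffusers import EDMEulerScheduler

    scheduler = EDMEulerScheduler(sigma_schedule="exponential")  # default is "karras"
    scheduler.set_timesteps(num_inference_steps=25)
    print(scheduler.sigmas[:3], scheduler.sigmas[-3:])  # decreasing sigmas, padded with a trailing zero

In a pipeline the same override would typically go through `EDMEulerScheduler.from_config(pipe.scheduler.config, sigma_schedule="exponential")`.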
diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py
index 1e3252c0bd39..be5bbc235878 100644
--- a/src/diffusers/schedulers/scheduling_euler_discrete.py
+++ b/src/diffusers/schedulers/scheduling_euler_discrete.py
@@ -576,5 +576,44 @@ def add_noise(
noisy_samples = original_samples + noise * sigma
return noisy_samples
+ def get_velocity(
+ self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.FloatTensor
+ ) -> torch.FloatTensor:
+ if (
+ isinstance(timesteps, int)
+ or isinstance(timesteps, torch.IntTensor)
+ or isinstance(timesteps, torch.LongTensor)
+ ):
+ raise ValueError(
+ (
+ "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
+ " `EulerDiscreteScheduler.get_velocity()` is not supported. Make sure to pass"
+ " one of the `scheduler.timesteps` as a timestep."
+ ),
+ )
+
+ if sample.device.type == "mps" and torch.is_floating_point(timesteps):
+ # mps does not support float64
+ schedule_timesteps = self.timesteps.to(sample.device, dtype=torch.float32)
+ timesteps = timesteps.to(sample.device, dtype=torch.float32)
+ else:
+ schedule_timesteps = self.timesteps.to(sample.device)
+ timesteps = timesteps.to(sample.device)
+
+ step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps]
+ alphas_cumprod = self.alphas_cumprod.to(sample)
+ sqrt_alpha_prod = alphas_cumprod[step_indices] ** 0.5
+ sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+ while len(sqrt_alpha_prod.shape) < len(sample.shape):
+ sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
+
+ sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[step_indices]) ** 0.5
+ sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+ while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape):
+ sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
+
+ velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
+ return velocity
+
def __len__(self):
return self.config.num_train_timesteps
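The new `get_velocity` mirrors `DDPMScheduler.get_velocity`: it computes the v-prediction target of Salimans & Ho, `v = sqrt(alpha_bar_t) * noise - sqrt(1 - alpha_bar_t) * sample`, after broadcasting the per-timestep scalars over the sample dimensions. A standalone sketch of just that formula (a hypothetical helper, not the scheduler API):

    import torch

    def velocity_target(sample, noise, alphas_cumprod, step_indices):
        # gather per-timestep scalars and broadcast them over the sample dimensions
        sqrt_alpha = alphas_cumprod[step_indices] ** 0.5
        sqrt_one_minus_alpha = (1 - alphas_cumprod[step_indices]) ** 0.5
        while sqrt_alpha.dim() < sample.dim():
            sqrt_alpha = sqrt_alpha.unsqueeze(-1)
            sqrt_one_minus_alpha = sqrt_one_minus_alpha.unsqueeze(-1)
        return sqrt_alpha * noise - sqrt_one_minus_alpha * sample

    alphas_cumprod = torch.linspace(0.9999, 0.0001, 1000)  # placeholder schedule
    sample = torch.randn(2, 4, 8, 8)
    noise = torch.randn_like(sample)
    v = velocity_target(sample, noise, alphas_cumprod, torch.tensor([10, 500]))
    print(v.shape)  # torch.Size([2, 4, 8, 8])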
diff --git a/src/diffusers/schedulers/scheduling_utils.py b/src/diffusers/schedulers/scheduling_utils.py
index 5dbdb82884bc..dcdce6c51f05 100644
--- a/src/diffusers/schedulers/scheduling_utils.py
+++ b/src/diffusers/schedulers/scheduling_utils.py
@@ -112,9 +112,9 @@ def from_pretrained(
force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
- incompletely downloaded files are deleted.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
diff --git a/src/diffusers/schedulers/scheduling_utils_flax.py b/src/diffusers/schedulers/scheduling_utils_flax.py
index a1d471f910e5..360ca4705e02 100644
--- a/src/diffusers/schedulers/scheduling_utils_flax.py
+++ b/src/diffusers/schedulers/scheduling_utils_flax.py
@@ -102,9 +102,9 @@ def from_pretrained(
force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to delete incompletely received files. Will attempt to resume the download if such a
- file exists.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
diff --git a/src/diffusers/training_utils.py b/src/diffusers/training_utils.py
index 25e02a3d1492..b617dd2eef39 100644
--- a/src/diffusers/training_utils.py
+++ b/src/diffusers/training_utils.py
@@ -1,12 +1,13 @@
import contextlib
import copy
import random
-from typing import Any, Dict, Iterable, List, Optional, Union
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
import numpy as np
import torch
from .models import UNet2DConditionModel
+from .schedulers import SchedulerMixin
from .utils import (
convert_state_dict_to_diffusers,
convert_state_dict_to_peft,
@@ -117,6 +118,60 @@ def resolve_interpolation_mode(interpolation_type: str):
return interpolation_mode
+def compute_dream_and_update_latents(
+ unet: UNet2DConditionModel,
+ noise_scheduler: SchedulerMixin,
+ timesteps: torch.Tensor,
+ noise: torch.Tensor,
+ noisy_latents: torch.Tensor,
+ target: torch.Tensor,
+ encoder_hidden_states: torch.Tensor,
+ dream_detail_preservation: float = 1.0,
+) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
+ """
+ Implements "DREAM (Diffusion Rectification and Estimation-Adaptive Models)" from http://arxiv.org/abs/2312.00210.
+ DREAM aligns training with sampling, making training more efficient and accurate, at the cost of an extra
+ forward pass without gradients.
+
+ Args:
+ `unet`: The UNet to use to make the prediction.
+ `noise_scheduler`: The noise scheduler used to add noise for the given timestep.
+ `timesteps`: The timesteps for the noise_scheduler to use.
+ `noise`: A tensor of noise in the shape of noisy_latents.
+ `noisy_latents`: Previously noised latents from the training loop.
+ `target`: The ground-truth tensor to predict after eps is removed.
+ `encoder_hidden_states`: Text embeddings from the text model.
+ `dream_detail_preservation`: The detail preservation exponent `p` from the paper
+ (`lambda = sqrt(1 - alphas_cumprod) ** p`); the paper uses `p = 1`.
+
+ Returns:
+ `tuple[torch.Tensor, torch.Tensor]`: Adjusted noisy_latents and target.
+ """
+ alphas_cumprod = noise_scheduler.alphas_cumprod.to(timesteps.device)[timesteps, None, None, None]
+ sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod) ** 0.5
+
+ # The paper uses lambda = sqrt(1 - alpha) ** p, with p = 1 in their experiments.
+ dream_lambda = sqrt_one_minus_alphas_cumprod**dream_detail_preservation
+
+ with torch.no_grad():
+ pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
+
+ if noise_scheduler.config.prediction_type == "epsilon":
+ predicted_noise = pred
+ delta_noise = (noise - predicted_noise).detach()
+ delta_noise.mul_(dream_lambda)
+ noisy_latents = noisy_latents.add(sqrt_one_minus_alphas_cumprod * delta_noise)
+ target = target.add(delta_noise)
+ elif noise_scheduler.config.prediction_type == "v_prediction":
+ raise NotImplementedError("DREAM has not been implemented for v-prediction")
+ else:
+ raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+ return noisy_latents, target
+
+
def unet_lora_state_dict(unet: UNet2DConditionModel) -> Dict[str, torch.Tensor]:
r"""
Returns:
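To see where `compute_dream_and_update_latents` slots into a training loop, here is a hedged, self-contained sketch of an epsilon-prediction step using a deliberately tiny placeholder UNet (the config and shapes below are illustrative, not taken from this PR):

    import torch
    from diffusers import DDPMScheduler, UNet2DConditionModel
    from diffusers.training_utils import compute_dream_and_update_latents

    noise_scheduler = DDPMScheduler(num_train_timesteps=1000, prediction_type="epsilon")
    unet = UNet2DConditionModel(  # tiny toy config; real training would load a pretrained UNet
        block_out_channels=(32, 64), layers_per_block=1, sample_size=8, in_channels=4, out_channels=4,
        down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
        up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), cross_attention_dim=32,
    )

    latents = torch.randn(2, 4, 8, 8)
    encoder_hidden_states = torch.randn(2, 77, 32)
    noise = torch.randn_like(latents)
    timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (2,))
    noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

    # DREAM adjusts both the model input and the target before the loss is computed.
    noisy_latents, target = compute_dream_and_update_latents(
        unet, noise_scheduler, timesteps, noise, noisy_latents, noise, encoder_hidden_states,
        dream_detail_preservation=1.0,
    )
    model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
    loss = torch.nn.functional.mse_loss(model_pred.float(), target.float())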
diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
index b750b60491a4..0583cf839ff7 100644
--- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py
+++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
@@ -92,6 +92,21 @@ def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch", "transformers"])
+class AnimateDiffSDXLPipeline(metaclass=DummyObject):
+ _backends = ["torch", "transformers"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch", "transformers"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "transformers"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "transformers"])
+
+
class AnimateDiffVideoToVideoPipeline(metaclass=DummyObject):
_backends = ["torch", "transformers"]
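The dummy class exists so that `from diffusers import AnimateDiffSDXLPipeline` still succeeds in an environment without `torch`/`transformers`; failure is deferred to the point of use. A hedged sketch of that behavior (the repo id is a placeholder):

    # Only meaningful in an environment where torch or transformers is missing:
    from diffusers import AnimateDiffSDXLPipeline  # resolves to the dummy class above

    try:
        AnimateDiffSDXLPipeline.from_pretrained("some-org/some-animatediff-sdxl-repo")
    except ImportError as err:
        # `requires_backends` raises and lists the backends that need to be installed
        print(err)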
diff --git a/src/diffusers/utils/dynamic_modules_utils.py b/src/diffusers/utils/dynamic_modules_utils.py
index add95812122c..8df418f3fdd4 100644
--- a/src/diffusers/utils/dynamic_modules_utils.py
+++ b/src/diffusers/utils/dynamic_modules_utils.py
@@ -201,7 +201,7 @@ def get_cached_module_file(
module_file: str,
cache_dir: Optional[Union[str, os.PathLike]] = None,
force_download: bool = False,
- resume_download: bool = False,
+ resume_download: Optional[bool] = None,
proxies: Optional[Dict[str, str]] = None,
token: Optional[Union[bool, str]] = None,
revision: Optional[str] = None,
@@ -228,9 +228,9 @@ def get_cached_module_file(
cache should not be used.
force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force to (re-)download the configuration files and override the cached versions if they
- exist.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists.
+ exist.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
+ of Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
@@ -380,7 +380,7 @@ def get_class_from_dynamic_module(
class_name: Optional[str] = None,
cache_dir: Optional[Union[str, os.PathLike]] = None,
force_download: bool = False,
- resume_download: bool = False,
+ resume_download: Optional[bool] = None,
proxies: Optional[Dict[str, str]] = None,
token: Optional[Union[bool, str]] = None,
revision: Optional[str] = None,
@@ -417,8 +417,9 @@ def get_class_from_dynamic_module(
force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force to (re-)download the configuration files and override the cached versions if they
exist.
- resume_download (`bool`, *optional*, defaults to `False`):
- Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 of
+ Diffusers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
diff --git a/src/diffusers/utils/hub_utils.py b/src/diffusers/utils/hub_utils.py
index d70ee53aaa41..83f02848fcf4 100644
--- a/src/diffusers/utils/hub_utils.py
+++ b/src/diffusers/utils/hub_utils.py
@@ -283,7 +283,7 @@ def _get_model_file(
cache_dir: Optional[str] = None,
force_download: bool = False,
proxies: Optional[Dict] = None,
- resume_download: bool = False,
+ resume_download: Optional[bool] = None,
local_files_only: bool = False,
token: Optional[str] = None,
user_agent: Optional[Union[Dict, str]] = None,
diff --git a/src/diffusers/utils/peft_utils.py b/src/diffusers/utils/peft_utils.py
index 8ea12e2e3b3f..ca55192ff7ae 100644
--- a/src/diffusers/utils/peft_utils.py
+++ b/src/diffusers/utils/peft_utils.py
@@ -246,7 +246,13 @@ def get_module_weight(weight_for_adapter, module_name):
for layer_name, weight_ in weight_for_adapter.items():
if layer_name in module_name:
return weight_
- raise RuntimeError(f"No LoRA weight found for module {module_name}.")
+
+ parts = module_name.split(".")
+ # e.g. key = "down_blocks.1.attentions.0"
+ key = f"{parts[0]}.{parts[1]}.attentions.{parts[3]}"
+ block_weight = weight_for_adapter.get(key, 1.0)
+
+ return block_weight
# iterate over each adapter, make it active and set the corresponding scaling weight
for adapter_name, weight in zip(adapter_names, weights):
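With the change above, a LoRA module whose exact name has no entry in the per-layer weight dict falls back to the weight stored for its parent attention block, defaulting to 1.0. A small standalone sketch of the key derivation (the module names below are hypothetical examples):

    def parent_attention_block_key(module_name: str) -> str:
        # "down_blocks.1.attentions.0.transformer_blocks.0.attn1.to_q" -> "down_blocks.1.attentions.0"
        parts = module_name.split(".")
        return f"{parts[0]}.{parts[1]}.attentions.{parts[3]}"

    weight_for_adapter = {"down_blocks.1.attentions.0": 0.6}
    name = "down_blocks.1.attentions.0.transformer_blocks.0.attn1.to_q"
    print(weight_for_adapter.get(parent_attention_block_key(name), 1.0))                                   # 0.6
    print(weight_for_adapter.get(parent_attention_block_key("up_blocks.0.attentions.1.attn2.to_k"), 1.0))  # 1.0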
diff --git a/tests/lora/test_lora_layers_sdxl.py b/tests/lora/test_lora_layers_sdxl.py
index b46b887d10fb..a8b2d2759f41 100644
--- a/tests/lora/test_lora_layers_sdxl.py
+++ b/tests/lora/test_lora_layers_sdxl.py
@@ -202,6 +202,36 @@ def test_sdxl_1_0_lora(self):
pipe.unload_lora_weights()
release_memory(pipe)
+ def test_sdxl_1_0_blockwise_lora(self):
+ generator = torch.Generator("cpu").manual_seed(0)
+
+ pipe = StableDiffusionXLPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")
+ pipe.enable_model_cpu_offload()
+ lora_model_id = "hf-internal-testing/sdxl-1.0-lora"
+ lora_filename = "sd_xl_offset_example-lora_1.0.safetensors"
+ pipe.load_lora_weights(lora_model_id, weight_name=lora_filename, adapter_name="offset")
+ scales = {
+ "unet": {
+ "down": {"block_1": [1.0, 1.0], "block_2": [1.0, 1.0]},
+ "mid": 1.0,
+ "up": {"block_0": [1.0, 1.0, 1.0], "block_1": [1.0, 1.0, 1.0]},
+ },
+ }
+ pipe.set_adapters(["offset"], [scales])
+
+ images = pipe(
+ "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
+ ).images
+
+ images = images[0, -3:, -3:, -1].flatten()
+ expected = np.array([0.4468, 0.4087, 0.4134, 0.366, 0.3202, 0.3505, 0.3786, 0.387, 0.3535])
+
+ max_diff = numpy_cosine_similarity_distance(expected, images)
+ assert max_diff < 1e-4
+
+ pipe.unload_lora_weights()
+ release_memory(pipe)
+
def test_sdxl_lcm_lora(self):
pipe = StableDiffusionXLPipeline.from_pretrained(
"stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py
index f919ba10fbb7..59369b509876 100644
--- a/tests/models/test_modeling_common.py
+++ b/tests/models/test_modeling_common.py
@@ -24,14 +24,20 @@
import numpy as np
import requests_mock
import torch
+from accelerate.utils import compute_module_sizes
from huggingface_hub import ModelCard, delete_repo
from huggingface_hub.utils import is_jinja_available
from requests.exceptions import HTTPError
from diffusers.models import UNet2DConditionModel
-from diffusers.models.attention_processor import AttnProcessor, AttnProcessor2_0, XFormersAttnProcessor
+from diffusers.models.attention_processor import (
+ AttnProcessor,
+ AttnProcessor2_0,
+ AttnProcessorNPU,
+ XFormersAttnProcessor,
+)
from diffusers.training_utils import EMAModel
-from diffusers.utils import is_xformers_available, logging
+from diffusers.utils import is_torch_npu_available, is_xformers_available, logging
from diffusers.utils.testing_utils import (
CaptureLogger,
get_python_version,
@@ -39,6 +45,7 @@
require_torch_2,
require_torch_accelerator_with_training,
require_torch_gpu,
+ require_torch_multi_gpu,
run_test_in_subprocess,
torch_device,
)
@@ -200,6 +207,21 @@ class ModelTesterMixin:
main_input_name = None # overwrite in model specific tester class
base_precision = 1e-3
forward_requires_fresh_args = False
+ model_split_percents = [0.5, 0.7, 0.9]
+
+ def check_device_map_is_respected(self, model, device_map):
+ for param_name, param in model.named_parameters():
+ # Find device in device_map
+ while len(param_name) > 0 and param_name not in device_map:
+ param_name = ".".join(param_name.split(".")[:-1])
+ if param_name not in device_map:
+ raise ValueError("device map is incomplete, it does not contain any device for `param_name`.")
+
+ param_device = device_map[param_name]
+ if param_device in ["cpu", "disk"]:
+ self.assertEqual(param.device, torch.device("meta"))
+ else:
+ self.assertEqual(param.device, torch.device(param_device))
def test_from_save_pretrained(self, expected_max_diff=5e-5):
if self.forward_requires_fresh_args:
@@ -283,6 +305,53 @@ def test_getattr_is_correct(self):
assert str(error.exception) == f"'{type(model).__name__}' object has no attribute 'does_not_exist'"
+ @unittest.skipIf(
+ torch_device != "npu" or not is_torch_npu_available(),
+ reason="torch npu flash attention is only available with NPU and `torch_npu` installed",
+ )
+ def test_set_torch_npu_flash_attn_processor_determinism(self):
+ torch.use_deterministic_algorithms(False)
+ if self.forward_requires_fresh_args:
+ model = self.model_class(**self.init_dict)
+ else:
+ init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+ model = self.model_class(**init_dict)
+ model.to(torch_device)
+
+ if not hasattr(model, "set_attn_processor"):
+ # If the model does not have `set_attn_processor`, skip the test
+ return
+
+ model.set_default_attn_processor()
+ assert all(type(proc) == AttnProcessorNPU for proc in model.attn_processors.values())
+ with torch.no_grad():
+ if self.forward_requires_fresh_args:
+ output = model(**self.inputs_dict(0))[0]
+ else:
+ output = model(**inputs_dict)[0]
+
+ model.enable_npu_flash_attention()
+ assert all(type(proc) == AttnProcessorNPU for proc in model.attn_processors.values())
+ with torch.no_grad():
+ if self.forward_requires_fresh_args:
+ output_2 = model(**self.inputs_dict(0))[0]
+ else:
+ output_2 = model(**inputs_dict)[0]
+
+ model.set_attn_processor(AttnProcessorNPU())
+ assert all(type(proc) == AttnProcessorNPU for proc in model.attn_processors.values())
+ with torch.no_grad():
+ if self.forward_requires_fresh_args:
+ output_3 = model(**self.inputs_dict(0))[0]
+ else:
+ output_3 = model(**inputs_dict)[0]
+
+ torch.use_deterministic_algorithms(True)
+
+ assert torch.allclose(output, output_2, atol=self.base_precision)
+ assert torch.allclose(output, output_3, atol=self.base_precision)
+ assert torch.allclose(output_2, output_3, atol=self.base_precision)
+
@unittest.skipIf(
torch_device != "cuda" or not is_xformers_available(),
reason="XFormers attention is only available with CUDA and `xformers` installed",
@@ -670,6 +739,129 @@ def test_deprecated_kwargs(self):
" from `_deprecated_kwargs = []`"
)
+ @require_torch_gpu
+ def test_cpu_offload(self):
+ config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+ model = self.model_class(**config).eval()
+ if model._no_split_modules is None:
+ return
+
+ model = model.to(torch_device)
+
+ torch.manual_seed(0)
+ base_output = model(**inputs_dict)
+
+ model_size = compute_module_sizes(model)[""]
+ # We test several splits of sizes to make sure it works.
+ max_gpu_sizes = [int(p * model_size) for p in self.model_split_percents[1:]]
+ with tempfile.TemporaryDirectory() as tmp_dir:
+ model.cpu().save_pretrained(tmp_dir)
+
+ for max_size in max_gpu_sizes:
+ max_memory = {0: max_size, "cpu": model_size * 2}
+ new_model = self.model_class.from_pretrained(tmp_dir, device_map="auto", max_memory=max_memory)
+ # Making sure part of the model will actually end up offloaded
+ self.assertSetEqual(set(new_model.hf_device_map.values()), {0, "cpu"})
+
+ self.check_device_map_is_respected(new_model, new_model.hf_device_map)
+ torch.manual_seed(0)
+ new_output = new_model(**inputs_dict)
+
+ self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5))
+
+ @require_torch_gpu
+ def test_disk_offload_without_safetensors(self):
+ config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+ model = self.model_class(**config).eval()
+ if model._no_split_modules is None:
+ return
+
+ model = model.to(torch_device)
+
+ torch.manual_seed(0)
+ base_output = model(**inputs_dict)
+
+ model_size = compute_module_sizes(model)[""]
+ with tempfile.TemporaryDirectory() as tmp_dir:
+ model.cpu().save_pretrained(tmp_dir, safe_serialization=False)
+
+ with self.assertRaises(ValueError):
+ max_size = int(self.model_split_percents[0] * model_size)
+ max_memory = {0: max_size, "cpu": max_size}
+ # This errors out because it's missing an offload folder
+ new_model = self.model_class.from_pretrained(tmp_dir, device_map="auto", max_memory=max_memory)
+
+ max_size = int(self.model_split_percents[0] * model_size)
+ max_memory = {0: max_size, "cpu": max_size}
+ new_model = self.model_class.from_pretrained(
+ tmp_dir, device_map="auto", max_memory=max_memory, offload_folder=tmp_dir
+ )
+
+ self.check_device_map_is_respected(new_model, new_model.hf_device_map)
+ torch.manual_seed(0)
+ new_output = new_model(**inputs_dict)
+
+ self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5))
+
+ @require_torch_gpu
+ def test_disk_offload_with_safetensors(self):
+ config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+ model = self.model_class(**config).eval()
+ if model._no_split_modules is None:
+ return
+
+ model = model.to(torch_device)
+
+ torch.manual_seed(0)
+ base_output = model(**inputs_dict)
+
+ model_size = compute_module_sizes(model)[""]
+ with tempfile.TemporaryDirectory() as tmp_dir:
+ model.cpu().save_pretrained(tmp_dir)
+
+ max_size = int(self.model_split_percents[0] * model_size)
+ max_memory = {0: max_size, "cpu": max_size}
+ new_model = self.model_class.from_pretrained(
+ tmp_dir, device_map="auto", offload_folder=tmp_dir, max_memory=max_memory
+ )
+
+ self.check_device_map_is_respected(new_model, new_model.hf_device_map)
+ torch.manual_seed(0)
+ new_output = new_model(**inputs_dict)
+
+ self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5))
+
+ @require_torch_multi_gpu
+ def test_model_parallelism(self):
+ config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+ model = self.model_class(**config).eval()
+ if model._no_split_modules is None:
+ return
+
+ model = model.to(torch_device)
+
+ torch.manual_seed(0)
+ base_output = model(**inputs_dict)
+
+ model_size = compute_module_sizes(model)[""]
+ # We test several splits of sizes to make sure it works.
+ max_gpu_sizes = [int(p * model_size) for p in self.model_split_percents[1:]]
+ with tempfile.TemporaryDirectory() as tmp_dir:
+ model.cpu().save_pretrained(tmp_dir)
+
+ for max_size in max_gpu_sizes:
+ max_memory = {0: max_size, 1: model_size * 2, "cpu": model_size * 2}
+ new_model = self.model_class.from_pretrained(tmp_dir, device_map="auto", max_memory=max_memory)
+ # Making sure part of the model will actually end up offloaded
+ self.assertSetEqual(set(new_model.hf_device_map.values()), {0, 1})
+
+ self.check_device_map_is_respected(new_model, new_model.hf_device_map)
+
+ torch.manual_seed(0)
+ new_output = new_model(**inputs_dict)
+
+ self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5))
+
@is_staging_test
class ModelPushToHubTester(unittest.TestCase):
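Outside the test suite, the sharded-loading path these new tests exercise looks roughly like the sketch below. It assumes a diffusers build where `ModelMixin.from_pretrained` accepts `device_map`/`max_memory` (as the tests above do), and the memory budgets are placeholders:

    from diffusers import UNet2DConditionModel

    model = UNet2DConditionModel.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        subfolder="unet",
        device_map="auto",
        max_memory={0: "2GiB", "cpu": "16GiB"},  # cap GPU 0, spill the rest to CPU
    )
    # Accelerate records where each top-level module ended up:
    print(model.hf_device_map)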
diff --git a/tests/models/unets/test_models_unet_2d_condition.py b/tests/models/unets/test_models_unet_2d_condition.py
index 1b8a998cfd66..33aa6a10377b 100644
--- a/tests/models/unets/test_models_unet_2d_condition.py
+++ b/tests/models/unets/test_models_unet_2d_condition.py
@@ -300,6 +300,8 @@ def create_custom_diffusion_layers(model, mock_weights: bool = True):
class UNet2DConditionModelTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase):
model_class = UNet2DConditionModel
main_input_name = "sample"
+ # We override the items here because the unet under consideration is small.
+ model_split_percents = [0.5, 0.3, 0.4]
@property
def dummy_input(self):
diff --git a/tests/models/unets/test_models_unet_controlnetxs.py b/tests/models/unets/test_models_unet_controlnetxs.py
index 8c9b43a20ad6..6f3662e01750 100644
--- a/tests/models/unets/test_models_unet_controlnetxs.py
+++ b/tests/models/unets/test_models_unet_controlnetxs.py
@@ -22,11 +22,7 @@
from diffusers import ControlNetXSAdapter, UNet2DConditionModel, UNetControlNetXSModel
from diffusers.utils import logging
-from diffusers.utils.testing_utils import (
- enable_full_determinism,
- floats_tensor,
- torch_device,
-)
+from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor, is_flaky, torch_device
from ..test_modeling_common import ModelTesterMixin, UNetTesterMixin
@@ -305,6 +301,7 @@ def _set_gradient_checkpointing_new(self, module, value=False):
assert set(modules_with_gc_enabled.keys()) == EXPECTED_SET
assert all(modules_with_gc_enabled.values()), "All modules should be enabled"
+ @is_flaky
def test_forward_no_control(self):
unet = self.get_dummy_unet()
controlnet = self.get_dummy_controlnet_from_unet(unet)
diff --git a/tests/pipelines/amused/test_amused.py b/tests/pipelines/amused/test_amused.py
index f03751e2f830..9a9e2551d642 100644
--- a/tests/pipelines/amused/test_amused.py
+++ b/tests/pipelines/amused/test_amused.py
@@ -38,17 +38,17 @@ class AmusedPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
def get_dummy_components(self):
torch.manual_seed(0)
transformer = UVit2DModel(
- hidden_size=32,
+ hidden_size=8,
use_bias=False,
hidden_dropout=0.0,
- cond_embed_dim=32,
+ cond_embed_dim=8,
micro_cond_encode_dim=2,
micro_cond_embed_dim=10,
- encoder_hidden_size=32,
+ encoder_hidden_size=8,
vocab_size=32,
- codebook_size=32,
- in_channels=32,
- block_out_channels=32,
+ codebook_size=8,
+ in_channels=8,
+ block_out_channels=8,
num_res_blocks=1,
downsample=True,
upsample=True,
@@ -56,7 +56,7 @@ def get_dummy_components(self):
num_hidden_layers=1,
num_attention_heads=1,
attention_dropout=0.0,
- intermediate_size=32,
+ intermediate_size=8,
layer_norm_eps=1e-06,
ln_elementwise_affine=True,
)
@@ -64,17 +64,17 @@ def get_dummy_components(self):
torch.manual_seed(0)
vqvae = VQModel(
act_fn="silu",
- block_out_channels=[32],
+ block_out_channels=[8],
down_block_types=[
"DownEncoderBlock2D",
],
in_channels=3,
- latent_channels=32,
- layers_per_block=2,
- norm_num_groups=32,
- num_vq_embeddings=32,
+ latent_channels=8,
+ layers_per_block=1,
+ norm_num_groups=8,
+ num_vq_embeddings=8,
out_channels=3,
- sample_size=32,
+ sample_size=8,
up_block_types=[
"UpDecoderBlock2D",
],
@@ -85,14 +85,14 @@ def get_dummy_components(self):
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
eos_token_id=2,
- hidden_size=32,
- intermediate_size=64,
+ hidden_size=8,
+ intermediate_size=8,
layer_norm_eps=1e-05,
- num_attention_heads=8,
- num_hidden_layers=3,
+ num_attention_heads=1,
+ num_hidden_layers=1,
pad_token_id=1,
vocab_size=1000,
- projection_dim=32,
+ projection_dim=8,
)
text_encoder = CLIPTextModelWithProjection(text_encoder_config)
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
diff --git a/tests/pipelines/amused/test_amused_img2img.py b/tests/pipelines/amused/test_amused_img2img.py
index efbca1f437a4..24bc34d330e9 100644
--- a/tests/pipelines/amused/test_amused_img2img.py
+++ b/tests/pipelines/amused/test_amused_img2img.py
@@ -42,17 +42,17 @@ class AmusedImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
def get_dummy_components(self):
torch.manual_seed(0)
transformer = UVit2DModel(
- hidden_size=32,
+ hidden_size=8,
use_bias=False,
hidden_dropout=0.0,
- cond_embed_dim=32,
+ cond_embed_dim=8,
micro_cond_encode_dim=2,
micro_cond_embed_dim=10,
- encoder_hidden_size=32,
+ encoder_hidden_size=8,
vocab_size=32,
- codebook_size=32,
- in_channels=32,
- block_out_channels=32,
+ codebook_size=8,
+ in_channels=8,
+ block_out_channels=8,
num_res_blocks=1,
downsample=True,
upsample=True,
@@ -60,7 +60,7 @@ def get_dummy_components(self):
num_hidden_layers=1,
num_attention_heads=1,
attention_dropout=0.0,
- intermediate_size=32,
+ intermediate_size=8,
layer_norm_eps=1e-06,
ln_elementwise_affine=True,
)
@@ -68,17 +68,17 @@ def get_dummy_components(self):
torch.manual_seed(0)
vqvae = VQModel(
act_fn="silu",
- block_out_channels=[32],
+ block_out_channels=[8],
down_block_types=[
"DownEncoderBlock2D",
],
in_channels=3,
- latent_channels=32,
- layers_per_block=2,
- norm_num_groups=32,
- num_vq_embeddings=32,
+ latent_channels=8,
+ layers_per_block=1,
+ norm_num_groups=8,
+ num_vq_embeddings=32, # reducing this to 16 or 8 -> RuntimeError: "cdist_cuda" not implemented for 'Half'
out_channels=3,
- sample_size=32,
+ sample_size=8,
up_block_types=[
"UpDecoderBlock2D",
],
@@ -89,14 +89,14 @@ def get_dummy_components(self):
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
eos_token_id=2,
- hidden_size=32,
- intermediate_size=64,
+ hidden_size=8,
+ intermediate_size=8,
layer_norm_eps=1e-05,
- num_attention_heads=8,
- num_hidden_layers=3,
+ num_attention_heads=1,
+ num_hidden_layers=1,
pad_token_id=1,
vocab_size=1000,
- projection_dim=32,
+ projection_dim=8,
)
text_encoder = CLIPTextModelWithProjection(text_encoder_config)
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
diff --git a/tests/pipelines/amused/test_amused_inpaint.py b/tests/pipelines/amused/test_amused_inpaint.py
index d397f8d81297..d0c1ed09c706 100644
--- a/tests/pipelines/amused/test_amused_inpaint.py
+++ b/tests/pipelines/amused/test_amused_inpaint.py
@@ -42,17 +42,17 @@ class AmusedInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
def get_dummy_components(self):
torch.manual_seed(0)
transformer = UVit2DModel(
- hidden_size=32,
+ hidden_size=8,
use_bias=False,
hidden_dropout=0.0,
- cond_embed_dim=32,
+ cond_embed_dim=8,
micro_cond_encode_dim=2,
micro_cond_embed_dim=10,
- encoder_hidden_size=32,
+ encoder_hidden_size=8,
vocab_size=32,
- codebook_size=32,
- in_channels=32,
- block_out_channels=32,
+ codebook_size=32, # codebook size needs to be consistent with num_vq_embeddings for inpaint tests
+ in_channels=8,
+ block_out_channels=8,
num_res_blocks=1,
downsample=True,
upsample=True,
@@ -60,7 +60,7 @@ def get_dummy_components(self):
num_hidden_layers=1,
num_attention_heads=1,
attention_dropout=0.0,
- intermediate_size=32,
+ intermediate_size=8,
layer_norm_eps=1e-06,
ln_elementwise_affine=True,
)
@@ -68,17 +68,17 @@ def get_dummy_components(self):
torch.manual_seed(0)
vqvae = VQModel(
act_fn="silu",
- block_out_channels=[32],
+ block_out_channels=[8],
down_block_types=[
"DownEncoderBlock2D",
],
in_channels=3,
- latent_channels=32,
- layers_per_block=2,
- norm_num_groups=32,
- num_vq_embeddings=32,
+ latent_channels=8,
+ layers_per_block=1,
+ norm_num_groups=8,
+ num_vq_embeddings=32, # reducing this to 16 or 8 -> RuntimeError: "cdist_cuda" not implemented for 'Half'
out_channels=3,
- sample_size=32,
+ sample_size=8,
up_block_types=[
"UpDecoderBlock2D",
],
@@ -89,14 +89,14 @@ def get_dummy_components(self):
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
eos_token_id=2,
- hidden_size=32,
- intermediate_size=64,
+ hidden_size=8,
+ intermediate_size=8,
layer_norm_eps=1e-05,
- num_attention_heads=8,
- num_hidden_layers=3,
+ num_attention_heads=1,
+ num_hidden_layers=1,
pad_token_id=1,
vocab_size=1000,
- projection_dim=32,
+ projection_dim=8,
)
text_encoder = CLIPTextModelWithProjection(text_encoder_config)
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
diff --git a/tests/pipelines/animatediff/test_animatediff_sdxl.py b/tests/pipelines/animatediff/test_animatediff_sdxl.py
new file mode 100644
index 000000000000..2db0139154e9
--- /dev/null
+++ b/tests/pipelines/animatediff/test_animatediff_sdxl.py
@@ -0,0 +1,307 @@
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
+
+import diffusers
+from diffusers import (
+ AnimateDiffSDXLPipeline,
+ AutoencoderKL,
+ DDIMScheduler,
+ MotionAdapter,
+ UNet2DConditionModel,
+ UNetMotionModel,
+)
+from diffusers.utils import is_xformers_available, logging
+from diffusers.utils.testing_utils import torch_device
+
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import (
+ IPAdapterTesterMixin,
+ PipelineTesterMixin,
+ SDFunctionTesterMixin,
+ SDXLOptionalComponentsTesterMixin,
+)
+
+
+def to_np(tensor):
+ if isinstance(tensor, torch.Tensor):
+ tensor = tensor.detach().cpu().numpy()
+
+ return tensor
+
+
+class AnimateDiffPipelineSDXLFastTests(
+ IPAdapterTesterMixin,
+ SDFunctionTesterMixin,
+ PipelineTesterMixin,
+ SDXLOptionalComponentsTesterMixin,
+ unittest.TestCase,
+):
+ pipeline_class = AnimateDiffSDXLPipeline
+ params = TEXT_TO_IMAGE_PARAMS
+ batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+ required_optional_params = frozenset(
+ [
+ "num_inference_steps",
+ "generator",
+ "latents",
+ "return_dict",
+ "callback_on_step_end",
+ "callback_on_step_end_tensor_inputs",
+ ]
+ )
+ callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS.union({"add_text_embeds", "add_time_ids"})
+
+ def get_dummy_components(self, time_cond_proj_dim=None):
+ torch.manual_seed(0)
+ unet = UNet2DConditionModel(
+ block_out_channels=(32, 64, 128),
+ layers_per_block=2,
+ time_cond_proj_dim=time_cond_proj_dim,
+ sample_size=32,
+ in_channels=4,
+ out_channels=4,
+ down_block_types=("DownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D"),
+ up_block_types=("CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "UpBlock2D"),
+ # SDXL-specific config below
+ attention_head_dim=(2, 4, 8),
+ use_linear_projection=True,
+ addition_embed_type="text_time",
+ addition_time_embed_dim=8,
+ transformer_layers_per_block=(1, 2, 4),
+ projection_class_embeddings_input_dim=80, # 6 * 8 + 32
+ cross_attention_dim=64,
+ norm_num_groups=1,
+ )
+ scheduler = DDIMScheduler(
+ beta_start=0.00085,
+ beta_end=0.012,
+ beta_schedule="linear",
+ clip_sample=False,
+ )
+ torch.manual_seed(0)
+ vae = AutoencoderKL(
+ block_out_channels=[32, 64],
+ in_channels=3,
+ out_channels=3,
+ down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+ up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+ latent_channels=4,
+ sample_size=128,
+ )
+ torch.manual_seed(0)
+ text_encoder_config = CLIPTextConfig(
+ bos_token_id=0,
+ eos_token_id=2,
+ hidden_size=32,
+ intermediate_size=37,
+ layer_norm_eps=1e-05,
+ num_attention_heads=4,
+ num_hidden_layers=5,
+ pad_token_id=1,
+ vocab_size=1000,
+ # SDXL-specific config below
+ hidden_act="gelu",
+ projection_dim=32,
+ )
+ text_encoder = CLIPTextModel(text_encoder_config)
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+ text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config)
+ tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+ motion_adapter = MotionAdapter(
+ block_out_channels=(32, 64, 128),
+ motion_layers_per_block=2,
+ motion_norm_num_groups=2,
+ motion_num_attention_heads=4,
+ use_motion_mid_block=False,
+ )
+
+ components = {
+ "unet": unet,
+ "scheduler": scheduler,
+ "vae": vae,
+ "motion_adapter": motion_adapter,
+ "text_encoder": text_encoder,
+ "tokenizer": tokenizer,
+ "text_encoder_2": text_encoder_2,
+ "tokenizer_2": tokenizer_2,
+ "feature_extractor": None,
+ "image_encoder": None,
+ }
+ return components
+
+ def get_dummy_inputs(self, device, seed=0):
+ if str(device).startswith("mps"):
+ generator = torch.manual_seed(seed)
+ else:
+ generator = torch.Generator(device=device).manual_seed(seed)
+
+ inputs = {
+ "prompt": "A painting of a squirrel eating a burger",
+ "generator": generator,
+ "num_inference_steps": 2,
+ "guidance_scale": 7.5,
+ "output_type": "np",
+ }
+ return inputs
+
+ def test_motion_unet_loading(self):
+ components = self.get_dummy_components()
+ pipe = AnimateDiffSDXLPipeline(**components)
+
+ assert isinstance(pipe.unet, UNetMotionModel)
+
+ @unittest.skip("Attention slicing is not enabled in this pipeline")
+ def test_attention_slicing_forward_pass(self):
+ pass
+
+ def test_inference_batch_single_identical(
+ self,
+ batch_size=2,
+ expected_max_diff=1e-4,
+ additional_params_copy_to_batched_inputs=["num_inference_steps"],
+ ):
+ components = self.get_dummy_components()
+ pipe = self.pipeline_class(**components)
+ for components in pipe.components.values():
+ if hasattr(components, "set_default_attn_processor"):
+ components.set_default_attn_processor()
+
+ pipe.to(torch_device)
+ pipe.set_progress_bar_config(disable=None)
+ inputs = self.get_dummy_inputs(torch_device)
+ # Reset generator in case it has been used in self.get_dummy_inputs
+ inputs["generator"] = self.get_generator(0)
+
+ logger = logging.get_logger(pipe.__module__)
+ logger.setLevel(level=diffusers.logging.FATAL)
+
+ # batchify inputs
+ batched_inputs = {}
+ batched_inputs.update(inputs)
+
+ for name in self.batch_params:
+ if name not in inputs:
+ continue
+
+ value = inputs[name]
+ if name == "prompt":
+ len_prompt = len(value)
+ batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
+ batched_inputs[name][-1] = 100 * "very long"
+
+ else:
+ batched_inputs[name] = batch_size * [value]
+
+ if "generator" in inputs:
+ batched_inputs["generator"] = [self.get_generator(i) for i in range(batch_size)]
+
+ if "batch_size" in inputs:
+ batched_inputs["batch_size"] = batch_size
+
+ for arg in additional_params_copy_to_batched_inputs:
+ batched_inputs[arg] = inputs[arg]
+
+ output = pipe(**inputs)
+ output_batch = pipe(**batched_inputs)
+
+ assert output_batch[0].shape[0] == batch_size
+
+ max_diff = np.abs(to_np(output_batch[0][0]) - to_np(output[0][0])).max()
+ assert max_diff < expected_max_diff
+
+ @unittest.skipIf(torch_device != "cuda", reason="CUDA and CPU are required to switch devices")
+ def test_to_device(self):
+ components = self.get_dummy_components()
+ pipe = self.pipeline_class(**components)
+ pipe.set_progress_bar_config(disable=None)
+
+ pipe.to("cpu")
+ # pipeline creates a new motion UNet under the hood. So we need to check the device from pipe.components
+ model_devices = [
+ component.device.type for component in pipe.components.values() if hasattr(component, "device")
+ ]
+ self.assertTrue(all(device == "cpu" for device in model_devices))
+
+ output_cpu = pipe(**self.get_dummy_inputs("cpu"))[0]
+ self.assertTrue(np.isnan(output_cpu).sum() == 0)
+
+ pipe.to("cuda")
+ model_devices = [
+ component.device.type for component in pipe.components.values() if hasattr(component, "device")
+ ]
+ self.assertTrue(all(device == "cuda" for device in model_devices))
+
+ output_cuda = pipe(**self.get_dummy_inputs("cuda"))[0]
+ self.assertTrue(np.isnan(to_np(output_cuda)).sum() == 0)
+
+ def test_to_dtype(self):
+ components = self.get_dummy_components()
+ pipe = self.pipeline_class(**components)
+ pipe.set_progress_bar_config(disable=None)
+
+ # pipeline creates a new motion UNet under the hood. So we need to check the dtype from pipe.components
+ model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
+ self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes))
+
+ pipe.to(dtype=torch.float16)
+ model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
+ self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes))
+
+ def test_prompt_embeds(self):
+ components = self.get_dummy_components()
+ pipe = self.pipeline_class(**components)
+ pipe.set_progress_bar_config(disable=None)
+ pipe.to(torch_device)
+
+ inputs = self.get_dummy_inputs(torch_device)
+ prompt = inputs.pop("prompt")
+
+ (
+ prompt_embeds,
+ negative_prompt_embeds,
+ pooled_prompt_embeds,
+ negative_pooled_prompt_embeds,
+ ) = pipe.encode_prompt(prompt)
+
+ pipe(
+ **inputs,
+ prompt_embeds=prompt_embeds,
+ negative_prompt_embeds=negative_prompt_embeds,
+ pooled_prompt_embeds=pooled_prompt_embeds,
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+ )
+
+ def test_save_load_optional_components(self):
+ self._test_save_load_optional_components()
+
+ @unittest.skipIf(
+ torch_device != "cuda" or not is_xformers_available(),
+ reason="XFormers attention is only available with CUDA and `xformers` installed",
+ )
+ def test_xformers_attention_forwardGenerator_pass(self):
+ components = self.get_dummy_components()
+ pipe = self.pipeline_class(**components)
+ for component in pipe.components.values():
+ if hasattr(component, "set_default_attn_processor"):
+ component.set_default_attn_processor()
+ pipe.to(torch_device)
+ pipe.set_progress_bar_config(disable=None)
+
+ inputs = self.get_dummy_inputs(torch_device)
+ output_without_offload = pipe(**inputs).frames[0]
+ output_without_offload = (
+ output_without_offload.cpu() if torch.is_tensor(output_without_offload) else output_without_offload
+ )
+
+ pipe.enable_xformers_memory_efficient_attention()
+ inputs = self.get_dummy_inputs(torch_device)
+ output_with_offload = pipe(**inputs).frames[0]
+ output_with_offload = (
+ output_with_offload.cpu() if torch.is_tensor(output_with_offload) else output_with_offload
+ )
+
+ max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max()
+ self.assertLess(max_diff, 1e-4, "XFormers attention should not affect the inference results")
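For reference, end-user inference with the pipeline this new test file covers would look roughly like the sketch below; the motion-adapter repo id is a placeholder and the scheduler settings are illustrative:

    import torch
    from diffusers import AnimateDiffSDXLPipeline, DDIMScheduler, MotionAdapter
    from diffusers.utils import export_to_gif

    adapter = MotionAdapter.from_pretrained(
        "your-org/animatediff-motion-adapter-sdxl", torch_dtype=torch.float16  # placeholder repo id
    )
    pipe = AnimateDiffSDXLPipeline.from_pretrained(
        "stabilityai/stable-diffusion-xl-base-1.0", motion_adapter=adapter, torch_dtype=torch.float16
    ).to("cuda")
    pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config, clip_sample=False)

    frames = pipe("a panda surfing at sunset", num_frames=16, num_inference_steps=25).frames[0]
    export_to_gif(frames, "panda.gif")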
diff --git a/tests/pipelines/audioldm/test_audioldm.py b/tests/pipelines/audioldm/test_audioldm.py
index f83dc8158e83..eddab54a3c03 100644
--- a/tests/pipelines/audioldm/test_audioldm.py
+++ b/tests/pipelines/audioldm/test_audioldm.py
@@ -66,16 +66,17 @@ class AudioLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
def get_dummy_components(self):
torch.manual_seed(0)
unet = UNet2DConditionModel(
- block_out_channels=(32, 64),
- layers_per_block=2,
+ block_out_channels=(8, 16),
+ layers_per_block=1,
+ norm_num_groups=8,
sample_size=32,
in_channels=4,
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
- cross_attention_dim=(32, 64),
+ cross_attention_dim=(8, 16),
class_embed_type="simple_projection",
- projection_class_embeddings_input_dim=32,
+ projection_class_embeddings_input_dim=8,
class_embeddings_concat=True,
)
scheduler = DDIMScheduler(
@@ -87,9 +88,10 @@ def get_dummy_components(self):
)
torch.manual_seed(0)
vae = AutoencoderKL(
- block_out_channels=[32, 64],
+ block_out_channels=[8, 16],
in_channels=1,
out_channels=1,
+ norm_num_groups=8,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
latent_channels=4,
@@ -98,14 +100,14 @@ def get_dummy_components(self):
text_encoder_config = ClapTextConfig(
bos_token_id=0,
eos_token_id=2,
- hidden_size=32,
+ hidden_size=8,
intermediate_size=37,
layer_norm_eps=1e-05,
- num_attention_heads=4,
- num_hidden_layers=5,
+ num_attention_heads=1,
+ num_hidden_layers=1,
pad_token_id=1,
vocab_size=1000,
- projection_dim=32,
+ projection_dim=8,
)
text_encoder = ClapTextModelWithProjection(text_encoder_config)
tokenizer = RobertaTokenizer.from_pretrained("hf-internal-testing/tiny-random-roberta", model_max_length=77)
diff --git a/tests/pipelines/blipdiffusion/test_blipdiffusion.py b/tests/pipelines/blipdiffusion/test_blipdiffusion.py
index c5eaa3883d09..7e85cef65129 100644
--- a/tests/pipelines/blipdiffusion/test_blipdiffusion.py
+++ b/tests/pipelines/blipdiffusion/test_blipdiffusion.py
@@ -64,9 +64,9 @@ def get_dummy_components(self):
torch.manual_seed(0)
text_encoder_config = CLIPTextConfig(
vocab_size=1000,
- hidden_size=16,
- intermediate_size=16,
- projection_dim=16,
+ hidden_size=8,
+ intermediate_size=8,
+ projection_dim=8,
num_hidden_layers=1,
num_attention_heads=1,
max_position_embeddings=77,
@@ -78,17 +78,17 @@ def get_dummy_components(self):
out_channels=4,
down_block_types=("DownEncoderBlock2D",),
up_block_types=("UpDecoderBlock2D",),
- block_out_channels=(32,),
+ block_out_channels=(8,),
+ norm_num_groups=8,
layers_per_block=1,
act_fn="silu",
latent_channels=4,
- norm_num_groups=16,
- sample_size=16,
+ sample_size=8,
)
blip_vision_config = {
- "hidden_size": 16,
- "intermediate_size": 16,
+ "hidden_size": 8,
+ "intermediate_size": 8,
"num_hidden_layers": 1,
"num_attention_heads": 1,
"image_size": 224,
@@ -98,32 +98,32 @@ def get_dummy_components(self):
blip_qformer_config = {
"vocab_size": 1000,
- "hidden_size": 16,
+ "hidden_size": 8,
"num_hidden_layers": 1,
"num_attention_heads": 1,
- "intermediate_size": 16,
+ "intermediate_size": 8,
"max_position_embeddings": 512,
"cross_attention_frequency": 1,
- "encoder_hidden_size": 16,
+ "encoder_hidden_size": 8,
}
qformer_config = Blip2Config(
vision_config=blip_vision_config,
qformer_config=blip_qformer_config,
- num_query_tokens=16,
+ num_query_tokens=8,
tokenizer="hf-internal-testing/tiny-random-bert",
)
qformer = Blip2QFormerModel(qformer_config)
unet = UNet2DConditionModel(
- block_out_channels=(16, 32),
- norm_num_groups=16,
+ block_out_channels=(8, 16),
+ norm_num_groups=8,
layers_per_block=1,
sample_size=16,
in_channels=4,
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
- cross_attention_dim=16,
+ cross_attention_dim=8,
)
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
@@ -189,7 +189,9 @@ def test_blipdiffusion(self):
assert image.shape == (1, 16, 16, 4)
- expected_slice = np.array([0.7096, 0.5900, 0.6703, 0.4032, 0.7766, 0.3629, 0.5447, 0.4149, 0.8172])
+ expected_slice = np.array(
+ [0.5329548, 0.8372512, 0.33269387, 0.82096875, 0.43657133, 0.3783, 0.5953028, 0.51934963, 0.42142007]
+ )
assert (
np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
diff --git a/tests/pipelines/ddim/test_ddim.py b/tests/pipelines/ddim/test_ddim.py
index 0f0654397a34..2078a592ceca 100644
--- a/tests/pipelines/ddim/test_ddim.py
+++ b/tests/pipelines/ddim/test_ddim.py
@@ -42,9 +42,10 @@ class DDIMPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
def get_dummy_components(self):
torch.manual_seed(0)
unet = UNet2DModel(
- block_out_channels=(32, 64),
- layers_per_block=2,
- sample_size=32,
+ block_out_channels=(4, 8),
+ layers_per_block=1,
+ norm_num_groups=4,
+ sample_size=8,
in_channels=3,
out_channels=3,
down_block_types=("DownBlock2D", "AttnDownBlock2D"),
@@ -79,10 +80,8 @@ def test_inference(self):
image = pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1]
- self.assertEqual(image.shape, (1, 32, 32, 3))
- expected_slice = np.array(
- [1.000e00, 5.717e-01, 4.717e-01, 1.000e00, 0.000e00, 1.000e00, 3.000e-04, 0.000e00, 9.000e-04]
- )
+ self.assertEqual(image.shape, (1, 8, 8, 3))
+ expected_slice = np.array([0.0, 9.979e-01, 0.0, 9.999e-01, 9.986e-01, 9.991e-01, 7.106e-04, 0.0, 0.0])
max_diff = np.abs(image_slice.flatten() - expected_slice).max()
self.assertLessEqual(max_diff, 1e-3)
diff --git a/tests/pipelines/ddpm/test_ddpm.py b/tests/pipelines/ddpm/test_ddpm.py
index c0cce3a2f237..f6d0821da4c2 100644
--- a/tests/pipelines/ddpm/test_ddpm.py
+++ b/tests/pipelines/ddpm/test_ddpm.py
@@ -30,9 +30,10 @@ class DDPMPipelineFastTests(unittest.TestCase):
def dummy_uncond_unet(self):
torch.manual_seed(0)
model = UNet2DModel(
- block_out_channels=(32, 64),
- layers_per_block=2,
- sample_size=32,
+ block_out_channels=(4, 8),
+ layers_per_block=1,
+ norm_num_groups=4,
+ sample_size=8,
in_channels=3,
out_channels=3,
down_block_types=("DownBlock2D", "AttnDownBlock2D"),
@@ -58,10 +59,8 @@ def test_fast_inference(self):
image_slice = image[0, -3:, -3:, -1]
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
- assert image.shape == (1, 32, 32, 3)
- expected_slice = np.array(
- [9.956e-01, 5.785e-01, 4.675e-01, 9.930e-01, 0.0, 1.000, 1.199e-03, 2.648e-04, 5.101e-04]
- )
+ assert image.shape == (1, 8, 8, 3)
+ expected_slice = np.array([0.0, 0.9996672, 0.00329116, 1.0, 0.9995991, 1.0, 0.0060907, 0.00115037, 0.0])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
@@ -83,7 +82,7 @@ def test_inference_predict_sample(self):
image_slice = image[0, -3:, -3:, -1]
image_eps_slice = image_eps[0, -3:, -3:, -1]
- assert image.shape == (1, 32, 32, 3)
+ assert image.shape == (1, 8, 8, 3)
tolerance = 1e-2 if torch_device != "mps" else 3e-2
assert np.abs(image_slice.flatten() - image_eps_slice.flatten()).max() < tolerance
diff --git a/tests/pipelines/pixart_alpha/test_pixart.py b/tests/pipelines/pixart_alpha/test_pixart.py
index d981b55260c7..dd358af08395 100644
--- a/tests/pipelines/pixart_alpha/test_pixart.py
+++ b/tests/pipelines/pixart_alpha/test_pixart.py
@@ -324,10 +324,6 @@ def test_raises_warning_for_mask_feature(self):
def test_inference_batch_single_identical(self):
self._test_inference_batch_single_identical(expected_max_diff=1e-3)
- # PixArt transformer model does not work with sequential offload so skip it for now
- def test_sequential_offload_forward_pass_twice(self):
- pass
-
@slow
@require_torch_gpu
diff --git a/tests/pipelines/pixart_sigma/test_pixart.py b/tests/pipelines/pixart_sigma/test_pixart.py
index 7b1d5e389f32..c0df15ae661d 100644
--- a/tests/pipelines/pixart_sigma/test_pixart.py
+++ b/tests/pipelines/pixart_sigma/test_pixart.py
@@ -308,10 +308,6 @@ def test_inference_with_multiple_images_per_prompt(self):
def test_inference_batch_single_identical(self):
self._test_inference_batch_single_identical(expected_max_diff=1e-3)
- # PixArt transformer model does not work with sequential offload so skip it for now
- def test_sequential_offload_forward_pass_twice(self):
- pass
-
@slow
@require_torch_gpu
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index 145e0012f8e9..137f1e696d93 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -1257,8 +1257,8 @@ def tearDown(self):
def test_download_from_hub(self):
ckpt_paths = [
- "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt",
- "https://huggingface.co/WarriorMama777/OrangeMixs/blob/main/Models/AbyssOrangeMix/AbyssOrangeMix_base.ckpt",
+ "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors",
+ "https://huggingface.co/WarriorMama777/OrangeMixs/blob/main/Models/AbyssOrangeMix/AbyssOrangeMix.safetensors",
]
for ckpt_path in ckpt_paths:
@@ -1271,7 +1271,7 @@ def test_download_from_hub(self):
assert image_out.shape == (512, 512, 3)
def test_download_local(self):
- ckpt_filename = hf_hub_download("runwayml/stable-diffusion-v1-5", filename="v1-5-pruned-emaonly.ckpt")
+ ckpt_filename = hf_hub_download("runwayml/stable-diffusion-v1-5", filename="v1-5-pruned-emaonly.safetensors")
config_filename = hf_hub_download("runwayml/stable-diffusion-v1-5", filename="v1-inference.yaml")
pipe = StableDiffusionPipeline.from_single_file(
@@ -1285,7 +1285,7 @@ def test_download_local(self):
assert image_out.shape == (512, 512, 3)
def test_download_ckpt_diff_format_is_same(self):
- ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt"
+ ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors"
sf_pipe = StableDiffusionPipeline.from_single_file(ckpt_path)
sf_pipe.scheduler = DDIMScheduler.from_config(sf_pipe.scheduler.config)
@@ -1310,7 +1310,7 @@ def test_download_ckpt_diff_format_is_same(self):
def test_single_file_component_configs(self):
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
- ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt"
+ ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors"
single_file_pipe = StableDiffusionPipeline.from_single_file(ckpt_path, load_safety_checker=True)
for param_name, param_value in single_file_pipe.text_encoder.config.to_dict().items():
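The stable-diffusion hunks above move the single-file fixtures from `.ckpt` to `.safetensors`. A minimal sketch of the loading path those tests exercise, reusing one of the URLs from the patch; the prompt and device placement are assumptions:

```python
import torch
from diffusers import StableDiffusionPipeline

# safetensors single-file checkpoint, as referenced in the updated fixtures.
ckpt_path = (
    "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/"
    "v1-5-pruned-emaonly.safetensors"
)

pipe = StableDiffusionPipeline.from_single_file(ckpt_path, torch_dtype=torch.float16)
pipe = pipe.to("cuda")  # assumes a CUDA device is available

image = pipe("an astronaut riding a horse", num_inference_steps=25).images[0]
```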
diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py
index 0c0a765f662d..c4b7a3b9187a 100644
--- a/tests/pipelines/test_pipelines_common.py
+++ b/tests/pipelines/test_pipelines_common.py
@@ -1360,6 +1360,8 @@ def _test_attention_slicing_forward_pass(
reason="CPU offload is only available with CUDA and `accelerate v0.14.0` or higher",
)
def test_sequential_cpu_offload_forward_pass(self, expected_max_diff=1e-4):
+ import accelerate
+
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
for component in pipe.components.values():
@@ -1373,6 +1375,7 @@ def test_sequential_cpu_offload_forward_pass(self, expected_max_diff=1e-4):
output_without_offload = pipe(**inputs)[0]
pipe.enable_sequential_cpu_offload()
+ assert pipe._execution_device.type == "cuda"
inputs = self.get_dummy_inputs(generator_device)
output_with_offload = pipe(**inputs)[0]
@@ -1380,11 +1383,48 @@ def test_sequential_cpu_offload_forward_pass(self, expected_max_diff=1e-4):
max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max()
self.assertLess(max_diff, expected_max_diff, "CPU offloading should not affect the inference results")
+ # make sure all `torch.nn.Module` components (except those in `self._exclude_from_cpu_offload`) are offloaded correctly
+ offloaded_modules = {
+ k: v
+ for k, v in pipe.components.items()
+ if isinstance(v, torch.nn.Module) and k not in pipe._exclude_from_cpu_offload
+ }
+ # 1. all offloaded modules should be saved to cpu and moved to meta device
+ self.assertTrue(
+ all(v.device.type == "meta" for v in offloaded_modules.values()),
+ f"Not offloaded: {[k for k, v in offloaded_modules.items() if v.device.type != 'meta']}",
+ )
+ # 2. all offloaded modules should have a hook installed
+ self.assertTrue(
+ all(hasattr(v, "_hf_hook") for k, v in offloaded_modules.items()),
+ f"No hook attached: {[k for k, v in offloaded_modules.items() if not hasattr(v, '_hf_hook')]}",
+ )
+ # 3. all offloaded modules should have the correct hook installed, which should be one of:
+ # - `AlignDevicesHook`
+ # - a `SequentialHook` that contains `AlignDevicesHook`
+ offloaded_modules_with_incorrect_hooks = {}
+ for k, v in offloaded_modules.items():
+ if hasattr(v, "_hf_hook"):
+ if isinstance(v._hf_hook, accelerate.hooks.SequentialHook):
+ # if it is a `SequentialHook`, we loop through its `hooks` attribute to check if it only contains `AlignDevicesHook`
+ for hook in v._hf_hook.hooks:
+ if not isinstance(hook, accelerate.hooks.AlignDevicesHook):
+ offloaded_modules_with_incorrect_hooks[k] = type(hook)
+ elif not isinstance(v._hf_hook, accelerate.hooks.AlignDevicesHook):
+ offloaded_modules_with_incorrect_hooks[k] = type(v._hf_hook)
+
+ self.assertTrue(
+ len(offloaded_modules_with_incorrect_hooks) == 0,
+ f"Not installed correct hook: {offloaded_modules_with_incorrect_hooks}",
+ )
+
@unittest.skipIf(
torch_device != "cuda" or not is_accelerate_available() or is_accelerate_version("<", "0.17.0"),
reason="CPU offload is only available with CUDA and `accelerate v0.17.0` or higher",
)
def test_model_cpu_offload_forward_pass(self, expected_max_diff=2e-4):
+ import accelerate
+
generator_device = "cpu"
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
@@ -1400,19 +1440,39 @@ def test_model_cpu_offload_forward_pass(self, expected_max_diff=2e-4):
output_without_offload = pipe(**inputs)[0]
pipe.enable_model_cpu_offload()
+ assert pipe._execution_device.type == "cuda"
+
inputs = self.get_dummy_inputs(generator_device)
output_with_offload = pipe(**inputs)[0]
max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max()
self.assertLess(max_diff, expected_max_diff, "CPU offloading should not affect the inference results")
- offloaded_modules = [
- v
+
+ # make sure all `torch.nn.Module` components (except those in `self._exclude_from_cpu_offload`) are offloaded correctly
+ offloaded_modules = {
+ k: v
for k, v in pipe.components.items()
if isinstance(v, torch.nn.Module) and k not in pipe._exclude_from_cpu_offload
- ]
- (
- self.assertTrue(all(v.device.type == "cpu" for v in offloaded_modules)),
- f"Not offloaded: {[v for v in offloaded_modules if v.device.type != 'cpu']}",
+ }
+ # 1. check if all offloaded modules are saved to cpu
+ self.assertTrue(
+ all(v.device.type == "cpu" for v in offloaded_modules.values()),
+ f"Not offloaded: {[k for k, v in offloaded_modules.items() if v.device.type != 'cpu']}",
+ )
+ # 2. check if all offloaded modules have hooks installed
+ self.assertTrue(
+ all(hasattr(v, "_hf_hook") for k, v in offloaded_modules.items()),
+ f"No hook attached: {[k for k, v in offloaded_modules.items() if not hasattr(v, '_hf_hook')]}",
+ )
+ # 3. check that all offloaded modules have the correct type of hook installed, which should be `CpuOffload`
+ offloaded_modules_with_incorrect_hooks = {}
+ for k, v in offloaded_modules.items():
+ if hasattr(v, "_hf_hook") and not isinstance(v._hf_hook, accelerate.hooks.CpuOffload):
+ offloaded_modules_with_incorrect_hooks[k] = type(v._hf_hook)
+
+ self.assertTrue(
+ len(offloaded_modules_with_incorrect_hooks) == 0,
+ f"Not installed correct hook: {offloaded_modules_with_incorrect_hooks}",
)
@unittest.skipIf(
@@ -1444,16 +1504,24 @@ def test_cpu_offload_forward_pass_twice(self, expected_max_diff=2e-4):
self.assertLess(
max_diff, expected_max_diff, "running CPU offloading 2nd time should not affect the inference results"
)
+
+ # make sure all `torch.nn.Module` components (except those in `self._exclude_from_cpu_offload`) are offloaded correctly
offloaded_modules = {
k: v
for k, v in pipe.components.items()
if isinstance(v, torch.nn.Module) and k not in pipe._exclude_from_cpu_offload
}
+ # 1. check if all offloaded modules are saved to cpu
self.assertTrue(
all(v.device.type == "cpu" for v in offloaded_modules.values()),
f"Not offloaded: {[k for k, v in offloaded_modules.items() if v.device.type != 'cpu']}",
)
-
+ # 2. check if all offloaded modules have hooks installed
+ self.assertTrue(
+ all(hasattr(v, "_hf_hook") for k, v in offloaded_modules.items()),
+ f"No hook attached: {[k for k, v in offloaded_modules.items() if not hasattr(v, '_hf_hook')]}",
+ )
+ # 3. check that all offloaded modules have the correct type of hook installed, which should be `CpuOffload`
offloaded_modules_with_incorrect_hooks = {}
for k, v in offloaded_modules.items():
if hasattr(v, "_hf_hook") and not isinstance(v._hf_hook, accelerate.hooks.CpuOffload):
@@ -1493,19 +1561,36 @@ def test_sequential_offload_forward_pass_twice(self, expected_max_diff=2e-4):
self.assertLess(
max_diff, expected_max_diff, "running sequential offloading second time should have the inference results"
)
+
+ # make sure all `torch.nn.Module` components (except those in `self._exclude_from_cpu_offload`) are offloaded correctly
offloaded_modules = {
k: v
for k, v in pipe.components.items()
if isinstance(v, torch.nn.Module) and k not in pipe._exclude_from_cpu_offload
}
+ # 1. check if all offloaded modules are moved to meta device
self.assertTrue(
all(v.device.type == "meta" for v in offloaded_modules.values()),
f"Not offloaded: {[k for k, v in offloaded_modules.items() if v.device.type != 'meta']}",
)
+ # 2. check that all offloaded modules have a hook installed
+ self.assertTrue(
+ all(hasattr(v, "_hf_hook") for k, v in offloaded_modules.items()),
+ f"No hook attached: {[k for k, v in offloaded_modules.items() if not hasattr(v, '_hf_hook')]}",
+ )
+ # 3. check that all offloaded modules have the correct hook installed, which should be one of:
+ # - `AlignDevicesHook`
+ # - a `SequentialHook` that contains `AlignDevicesHook`
offloaded_modules_with_incorrect_hooks = {}
for k, v in offloaded_modules.items():
- if hasattr(v, "_hf_hook") and not isinstance(v._hf_hook, accelerate.hooks.AlignDevicesHook):
- offloaded_modules_with_incorrect_hooks[k] = type(v._hf_hook)
+ if hasattr(v, "_hf_hook"):
+ if isinstance(v._hf_hook, accelerate.hooks.SequentialHook):
+ # if it is a `SequentialHook`, we loop through its `hooks` attribute to check if it only contains `AlignDevicesHook`
+ for hook in v._hf_hook.hooks:
+ if not isinstance(hook, accelerate.hooks.AlignDevicesHook):
+ offloaded_modules_with_incorrect_hooks[k] = type(hook)
+ elif not isinstance(v._hf_hook, accelerate.hooks.AlignDevicesHook):
+ offloaded_modules_with_incorrect_hooks[k] = type(v._hf_hook)
self.assertTrue(
len(offloaded_modules_with_incorrect_hooks) == 0,
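For sequential offload, the hunks above accept either a plain `AlignDevicesHook` or a `SequentialHook` whose sub-hooks are all `AlignDevicesHook`s. A hedged, standalone version of that inspection (the helper name is an assumption), usable after `pipe.enable_sequential_cpu_offload()`:

```python
import accelerate
import torch


def find_incorrect_offload_hooks(pipe):
    """Map component name -> offending hook type for modules that are not
    guarded by an AlignDevicesHook (directly or inside a SequentialHook)."""
    incorrect = {}
    for name, module in pipe.components.items():
        if not isinstance(module, torch.nn.Module) or name in pipe._exclude_from_cpu_offload:
            continue
        hook = getattr(module, "_hf_hook", None)
        if hook is None:
            incorrect[name] = None  # no hook attached at all
        elif isinstance(hook, accelerate.hooks.SequentialHook):
            # A SequentialHook is acceptable only if every sub-hook aligns devices.
            for sub_hook in hook.hooks:
                if not isinstance(sub_hook, accelerate.hooks.AlignDevicesHook):
                    incorrect[name] = type(sub_hook)
        elif not isinstance(hook, accelerate.hooks.AlignDevicesHook):
            incorrect[name] = type(hook)
    return incorrect


# Example usage after enabling sequential offload on any diffusers pipeline:
#     pipe.enable_sequential_cpu_offload()
#     assert not find_incorrect_offload_hooks(pipe)
```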