diff --git a/.github/workflows/nightly_tests.yml b/.github/workflows/nightly_tests.yml
index 3314c2c1cfb4..ae5f36e5bd2e 100644
--- a/.github/workflows/nightly_tests.yml
+++ b/.github/workflows/nightly_tests.yml
@@ -116,6 +116,7 @@ jobs:
run:
shell: bash
strategy:
+ fail-fast: false
max-parallel: 2
matrix:
module: [models, schedulers, lora, others, single_file, examples]
@@ -290,64 +291,118 @@ jobs:
pip install slack_sdk tabulate
python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
- run_nightly_tests_apple_m1:
- name: Nightly PyTorch MPS tests on MacOS
- runs-on: [ self-hosted, apple-m1 ]
- if: github.event_name == 'schedule'
-
- steps:
- - name: Checkout diffusers
- uses: actions/checkout@v3
- with:
- fetch-depth: 2
-
- - name: Clean checkout
- shell: arch -arch arm64 bash {0}
- run: |
- git clean -fxd
-
- - name: Setup miniconda
- uses: ./.github/actions/setup-miniconda
- with:
- python-version: 3.9
-
- - name: Install dependencies
- shell: arch -arch arm64 bash {0}
- run: |
- ${CONDA_RUN} python -m pip install --upgrade pip uv
- ${CONDA_RUN} python -m uv pip install -e [quality,test]
- ${CONDA_RUN} python -m uv pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
- ${CONDA_RUN} python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate
- ${CONDA_RUN} python -m uv pip install pytest-reportlog
-
- - name: Environment
- shell: arch -arch arm64 bash {0}
- run: |
- ${CONDA_RUN} python utils/print_env.py
-
- - name: Run nightly PyTorch tests on M1 (MPS)
- shell: arch -arch arm64 bash {0}
- env:
- HF_HOME: /System/Volumes/Data/mnt/cache
- HF_TOKEN: ${{ secrets.HF_TOKEN }}
- run: |
- ${CONDA_RUN} python -m pytest -n 1 -s -v --make-reports=tests_torch_mps \
- --report-log=tests_torch_mps.log \
- tests/
-
- - name: Failure short reports
- if: ${{ failure() }}
- run: cat reports/tests_torch_mps_failures_short.txt
-
- - name: Test suite reports artifacts
- if: ${{ always() }}
- uses: actions/upload-artifact@v2
- with:
- name: torch_mps_test_reports
- path: reports
-
- - name: Generate Report and Notify Channel
- if: always()
- run: |
- pip install slack_sdk tabulate
- python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
+# M1 runner currently not well supported
+# TODO: (Dhruv) add these back when we setup better testing for Apple Silicon
+# run_nightly_tests_apple_m1:
+# name: Nightly PyTorch MPS tests on MacOS
+# runs-on: [ self-hosted, apple-m1 ]
+# if: github.event_name == 'schedule'
+#
+# steps:
+# - name: Checkout diffusers
+# uses: actions/checkout@v3
+# with:
+# fetch-depth: 2
+#
+# - name: Clean checkout
+# shell: arch -arch arm64 bash {0}
+# run: |
+# git clean -fxd
+# - name: Setup miniconda
+# uses: ./.github/actions/setup-miniconda
+# with:
+# python-version: 3.9
+#
+# - name: Install dependencies
+# shell: arch -arch arm64 bash {0}
+# run: |
+# ${CONDA_RUN} python -m pip install --upgrade pip uv
+# ${CONDA_RUN} python -m uv pip install -e [quality,test]
+# ${CONDA_RUN} python -m uv pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
+# ${CONDA_RUN} python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate
+# ${CONDA_RUN} python -m uv pip install pytest-reportlog
+# - name: Environment
+# shell: arch -arch arm64 bash {0}
+# run: |
+# ${CONDA_RUN} python utils/print_env.py
+# - name: Run nightly PyTorch tests on M1 (MPS)
+# shell: arch -arch arm64 bash {0}
+# env:
+# HF_HOME: /System/Volumes/Data/mnt/cache
+# HF_TOKEN: ${{ secrets.HF_TOKEN }}
+# run: |
+# ${CONDA_RUN} python -m pytest -n 1 -s -v --make-reports=tests_torch_mps \
+# --report-log=tests_torch_mps.log \
+# tests/
+# - name: Failure short reports
+# if: ${{ failure() }}
+# run: cat reports/tests_torch_mps_failures_short.txt
+#
+# - name: Test suite reports artifacts
+# if: ${{ always() }}
+# uses: actions/upload-artifact@v2
+# with:
+# name: torch_mps_test_reports
+# path: reports
+#
+# - name: Generate Report and Notify Channel
+# if: always()
+# run: |
+# pip install slack_sdk tabulate
+# python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
+# run_nightly_tests_apple_m1:
+# name: Nightly PyTorch MPS tests on MacOS
+# runs-on: [ self-hosted, apple-m1 ]
+# if: github.event_name == 'schedule'
+#
+# steps:
+# - name: Checkout diffusers
+# uses: actions/checkout@v3
+# with:
+# fetch-depth: 2
+#
+# - name: Clean checkout
+# shell: arch -arch arm64 bash {0}
+# run: |
+# git clean -fxd
+# - name: Setup miniconda
+# uses: ./.github/actions/setup-miniconda
+# with:
+# python-version: 3.9
+#
+# - name: Install dependencies
+# shell: arch -arch arm64 bash {0}
+# run: |
+# ${CONDA_RUN} python -m pip install --upgrade pip uv
+# ${CONDA_RUN} python -m uv pip install -e [quality,test]
+# ${CONDA_RUN} python -m uv pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
+# ${CONDA_RUN} python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate
+# ${CONDA_RUN} python -m uv pip install pytest-reportlog
+# - name: Environment
+# shell: arch -arch arm64 bash {0}
+# run: |
+# ${CONDA_RUN} python utils/print_env.py
+# - name: Run nightly PyTorch tests on M1 (MPS)
+# shell: arch -arch arm64 bash {0}
+# env:
+# HF_HOME: /System/Volumes/Data/mnt/cache
+# HF_TOKEN: ${{ secrets.HF_TOKEN }}
+# run: |
+# ${CONDA_RUN} python -m pytest -n 1 -s -v --make-reports=tests_torch_mps \
+# --report-log=tests_torch_mps.log \
+# tests/
+# - name: Failure short reports
+# if: ${{ failure() }}
+# run: cat reports/tests_torch_mps_failures_short.txt
+#
+# - name: Test suite reports artifacts
+# if: ${{ always() }}
+# uses: actions/upload-artifact@v2
+# with:
+# name: torch_mps_test_reports
+# path: reports
+#
+# - name: Generate Report and Notify Channel
+# if: always()
+# run: |
+# pip install slack_sdk tabulate
+# python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
\ No newline at end of file
diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml
index 49a5eb47c02b..5cba056c7cba 100644
--- a/.github/workflows/push_tests.yml
+++ b/.github/workflows/push_tests.yml
@@ -1,9 +1,11 @@
-name: Slow Tests on main
+name: Fast GPU Tests on main
on:
push:
branches:
- main
+ - "v*.*.*-release"
+ - "v*.*.*-patch"
paths:
- "src/diffusers/**.py"
- "examples/**.py"
@@ -112,6 +114,8 @@ jobs:
run:
shell: bash
strategy:
+ fail-fast: false
+ max-parallel: 2
matrix:
module: [models, schedulers, lora, others, single_file]
steps:
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 0aa2a77dbcac..049d317599ad 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -57,7 +57,7 @@ Any question or comment related to the Diffusers library can be asked on the [di
- ...
Every question that is asked on the forum or on Discord actively encourages the community to publicly
-share knowledge and might very well help a beginner in the future that has the same question you're
+share knowledge and might very well help a beginner in the future who has the same question you're
having. Please do pose any questions you might have.
In the same spirit, you are of immense help to the community by answering such questions because this way you are publicly documenting knowledge for everybody to learn from.
@@ -503,4 +503,4 @@ $ git push --set-upstream origin your-branch-for-syncing
### Style guide
-For documentation strings, 🧨 Diffusers follows the [Google style](https://google.github.io/styleguide/pyguide.html).
\ No newline at end of file
+For documentation strings, 🧨 Diffusers follows the [Google style](https://google.github.io/styleguide/pyguide.html).
diff --git a/PHILOSOPHY.md b/PHILOSOPHY.md
index 9e25b4cc6d2e..e85182ab46fe 100644
--- a/PHILOSOPHY.md
+++ b/PHILOSOPHY.md
@@ -15,7 +15,7 @@ specific language governing permissions and limitations under the License.
🧨 Diffusers provides **state-of-the-art** pretrained diffusion models across multiple modalities.
Its purpose is to serve as a **modular toolbox** for both inference and training.
-We aim at building a library that stands the test of time and therefore take API design very seriously.
+We aim to build a library that stands the test of time and therefore take API design very seriously.
In a nutshell, Diffusers is built to be a natural extension of PyTorch. Therefore, most of our design choices are based on [PyTorch's Design Principles](https://pytorch.org/docs/stable/community/design.html#pytorch-design-philosophy). Let's go over the most important ones:
@@ -107,4 +107,4 @@ The following design principles are followed:
- Every scheduler exposes the timesteps to be "looped over" via a `timesteps` attribute, which is an array of timesteps the model will be called upon.
- The `step(...)` function takes a predicted model output and the "current" sample (x_t) and returns the "previous", slightly more denoised sample (x_t-1).
- Given the complexity of diffusion schedulers, the `step` function does not expose all the complexity and can be a bit of a "black box".
-- In almost all cases, novel schedulers shall be implemented in a new scheduling file.
\ No newline at end of file
+- In almost all cases, novel schedulers shall be implemented in a new scheduling file.
diff --git a/docs/source/en/api/pipelines/cogvideox.md b/docs/source/en/api/pipelines/cogvideox.md
index 549666e60ebc..c7340eff40c4 100644
--- a/docs/source/en/api/pipelines/cogvideox.md
+++ b/docs/source/en/api/pipelines/cogvideox.md
@@ -29,6 +29,10 @@ Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.m
This pipeline was contributed by [zRzRzRzRzRzRzR](https://github.com/zRzRzRzRzRzRzR). The original codebase can be found [here](https://huggingface.co/THUDM). The original weights can be found under [hf.co/THUDM](https://huggingface.co/THUDM).
+There are two models available that can be used with the CogVideoX pipeline:
+- [`THUDM/CogVideoX-2b`](https://huggingface.co/THUDM/CogVideoX-2b)
+- [`THUDM/CogVideoX-5b`](https://huggingface.co/THUDM/CogVideoX-5b)
+
## Inference
Use [`torch.compile`](https://huggingface.co/docs/diffusers/main/en/tutorials/fast_diffusion#torchcompile) to reduce the inference latency.
@@ -68,7 +72,7 @@ With torch.compile(): Average inference time: 76.27 seconds.
### Memory optimization
-CogVideoX requires about 19 GB of GPU memory to decode 49 frames (6 seconds of video at 8 FPS) with output resolution 720x480 (W x H), which makes it not possible to run on consumer GPUs or free-tier T4 Colab. The following memory optimizations could be used to reduce the memory footprint. For replication, you can refer to [this](https://gist.github.com/a-r-r-o-w/3959a03f15be5c9bd1fe545b09dfcc93) script.
+CogVideoX-2b requires about 19 GB of GPU memory to decode 49 frames (6 seconds of video at 8 FPS) with output resolution 720x480 (W x H), which makes it not possible to run on consumer GPUs or free-tier T4 Colab. The following memory optimizations could be used to reduce the memory footprint. For replication, you can refer to [this](https://gist.github.com/a-r-r-o-w/3959a03f15be5c9bd1fe545b09dfcc93) script.
- `pipe.enable_model_cpu_offload()`:
- Without enabling cpu offloading, memory usage is `33 GB`
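For quick reference alongside the checkpoint list and memory notes added above, here is a minimal inference sketch. It assumes the public `CogVideoXPipeline` API; the prompt, step count, and guidance scale are illustrative values, not settings taken from this diff (the 49-frame / 8 FPS figures mirror the memory discussion above, and bf16 follows the 5B convention noted later in this patch).

```python
import torch
from diffusers import CogVideoXPipeline
from diffusers.utils import export_to_video

# Load the 5B checkpoint listed in the docs; swap in "THUDM/CogVideoX-2b" (fp16) for the smaller model.
pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16)

# One of the memory optimizations described above: keep submodules on the CPU and move them
# to the GPU only while they run, instead of an upfront pipe.to("cuda").
pipe.enable_model_cpu_offload()

prompt = "A panda playing a guitar in a bamboo forest"  # illustrative prompt
video = pipe(prompt=prompt, num_inference_steps=50, guidance_scale=6.0, num_frames=49).frames[0]
export_to_video(video, "cogvideox_sample.mp4", fps=8)
```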
diff --git a/docs/source/en/api/pipelines/controlnet_sd3.md b/docs/source/en/api/pipelines/controlnet_sd3.md
index e55d210005bf..bb91a43cbaef 100644
--- a/docs/source/en/api/pipelines/controlnet_sd3.md
+++ b/docs/source/en/api/pipelines/controlnet_sd3.md
@@ -22,7 +22,7 @@ The abstract from the paper is:
*We present ControlNet, a neural network architecture to add spatial conditioning controls to large, pretrained text-to-image diffusion models. ControlNet locks the production-ready large diffusion models, and reuses their deep and robust encoding layers pretrained with billions of images as a strong backbone to learn a diverse set of conditional controls. The neural architecture is connected with "zero convolutions" (zero-initialized convolution layers) that progressively grow the parameters from zero and ensure that no harmful noise could affect the finetuning. We test various conditioning controls, eg, edges, depth, segmentation, human pose, etc, with Stable Diffusion, using single or multiple conditions, with or without prompts. We show that the training of ControlNets is robust with small (<50k) and large (>1m) datasets. Extensive results show that ControlNet may facilitate wider applications to control image diffusion models.*
-This controlnet code is mainly implemented by [The InstantX Team](https://huggingface.co/InstantX). The inpainting-related code was developed by [The Alimama Creative Team](https://huggingface.co/alimama-creative). You can find pre-trained checkpoints for SD3-ControlNet in the table below:
+This controlnet code is mainly implemented by [The InstantX Team](https://huggingface.co/InstantX). The inpainting-related code was developed by [The Alimama Creative Team](https://huggingface.co/alimama-creative). You can find pre-trained checkpoints for SD3-ControlNet in the table below:
| ControlNet type | Developer | Link |
diff --git a/docs/source/en/api/pipelines/kolors.md b/docs/source/en/api/pipelines/kolors.md
index a35a29d8a061..dce277942855 100644
--- a/docs/source/en/api/pipelines/kolors.md
+++ b/docs/source/en/api/pipelines/kolors.md
@@ -14,7 +14,7 @@ specific language governing permissions and limitations under the License.
![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/kolors/kolors_header_collage.png)
-Kolors is a large-scale text-to-image generation model based on latent diffusion, developed by [the Kuaishou Kolors team](kwai-kolors@kuaishou.com). Trained on billions of text-image pairs, Kolors exhibits significant advantages over both open-source and closed-source models in visual quality, complex semantic accuracy, and text rendering for both Chinese and English characters. Furthermore, Kolors supports both Chinese and English inputs, demonstrating strong performance in understanding and generating Chinese-specific content. For more details, please refer to this [technical report](https://github.com/Kwai-Kolors/Kolors/blob/master/imgs/Kolors_paper.pdf).
+Kolors is a large-scale text-to-image generation model based on latent diffusion, developed by [the Kuaishou Kolors team](https://github.com/Kwai-Kolors/Kolors). Trained on billions of text-image pairs, Kolors exhibits significant advantages over both open-source and closed-source models in visual quality, complex semantic accuracy, and text rendering for both Chinese and English characters. Furthermore, Kolors supports both Chinese and English inputs, demonstrating strong performance in understanding and generating Chinese-specific content. For more details, please refer to this [technical report](https://github.com/Kwai-Kolors/Kolors/blob/master/imgs/Kolors_paper.pdf).
The abstract from the technical report is:
@@ -74,7 +74,7 @@ image_encoder = CLIPVisionModelWithProjection.from_pretrained(
pipe = KolorsPipeline.from_pretrained(
"Kwai-Kolors/Kolors-diffusers", image_encoder=image_encoder, torch_dtype=torch.float16, variant="fp16"
-).to("cuda")
+)
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config, use_karras_sigmas=True)
pipe.load_ip_adapter(
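A note on the `.to("cuda")` removal above: the change is consistent with letting the example manage device placement after the IP-Adapter is attached (for instance via CPU offloading) rather than pinning the whole pipeline in VRAM up front, though the diff itself does not state the rationale. Below is a minimal sketch of that pattern; the IP-Adapter loading call is left as a comment because its arguments are truncated in the hunk above.

```python
import torch
from diffusers import DPMSolverMultistepScheduler, KolorsPipeline

# Construct the pipeline on the CPU -- note: no .to("cuda") here, matching the doc change above.
pipe = KolorsPipeline.from_pretrained(
    "Kwai-Kolors/Kolors-diffusers",
    torch_dtype=torch.float16,
    variant="fp16",
    # the docs example above also passes image_encoder=... for the IP-Adapter variant
)
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config, use_karras_sigmas=True)

# pipe.load_ip_adapter(...)  # load the Kolors IP-Adapter as shown in the surrounding docs

# Submodules are moved to the GPU only while they execute, so an upfront .to("cuda")
# is unnecessary and would only raise peak VRAM usage.
pipe.enable_model_cpu_offload()
```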
diff --git a/docs/source/en/api/pipelines/pag.md b/docs/source/en/api/pipelines/pag.md
index e601d1029622..aa69598ae290 100644
--- a/docs/source/en/api/pipelines/pag.md
+++ b/docs/source/en/api/pipelines/pag.md
@@ -20,7 +20,7 @@ The abstract from the paper is:
*Recent studies have demonstrated that diffusion models are capable of generating high-quality samples, but their quality heavily depends on sampling guidance techniques, such as classifier guidance (CG) and classifier-free guidance (CFG). These techniques are often not applicable in unconditional generation or in various downstream tasks such as image restoration. In this paper, we propose a novel sampling guidance, called Perturbed-Attention Guidance (PAG), which improves diffusion sample quality across both unconditional and conditional settings, achieving this without requiring additional training or the integration of external modules. PAG is designed to progressively enhance the structure of samples throughout the denoising process. It involves generating intermediate samples with degraded structure by substituting selected self-attention maps in diffusion U-Net with an identity matrix, by considering the self-attention mechanisms' ability to capture structural information, and guiding the denoising process away from these degraded samples. In both ADM and Stable Diffusion, PAG surprisingly improves sample quality in conditional and even unconditional scenarios. Moreover, PAG significantly improves the baseline performance in various downstream tasks where existing guidances such as CG or CFG cannot be fully utilized, including ControlNet with empty prompts and image restoration such as inpainting and deblurring.*
-PAG can be used by specifying the `pag_applied_layers` as a parameter when instantiating a PAG pipeline. It can be a single string or a list of strings. Each string can be a unique layer identifier or a regular expression to identify one or more layers.
+PAG can be used by specifying the `pag_applied_layers` as a parameter when instantiating a PAG pipeline. It can be a single string or a list of strings. Each string can be a unique layer identifier or a regular expression to identify one or more layers.
- Full identifier as a normal string: `down_blocks.2.attentions.0.transformer_blocks.0.attn1.processor`
- Full identifier as a RegEx: `down_blocks.2.(attentions|motion_modules).0.transformer_blocks.0.attn1.processor`
@@ -46,7 +46,7 @@ Since RegEx is supported as a way for matching layer identifiers, it is crucial
## KolorsPAGPipeline
[[autodoc]] KolorsPAGPipeline
- all
- - __call__
+ - __call__
## StableDiffusionPAGPipeline
[[autodoc]] StableDiffusionPAGPipeline
@@ -78,6 +78,10 @@ Since RegEx is supported as a way for matching layer identifiers, it is crucial
- all
- __call__
+## StableDiffusionXLControlNetPAGImg2ImgPipeline
+[[autodoc]] StableDiffusionXLControlNetPAGImg2ImgPipeline
+ - all
+ - __call__
## StableDiffusion3PAGPipeline
[[autodoc]] StableDiffusion3PAGPipeline
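Since `pag_applied_layers` is simply passed at pipeline construction time, a short usage sketch may help readers of this section. The checkpoint, layer selector, and scales below are illustrative choices; the `"mid"` shorthand and the `pag_scale` call argument come from the existing PAG docs, not from this diff.

```python
import torch
from diffusers import AutoPipelineForText2Image

# enable_pag=True selects the PAG variant of the pipeline, and pag_applied_layers
# accepts plain identifiers or regular expressions as described above.
pipe = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    enable_pag=True,
    pag_applied_layers=["mid"],  # e.g. perturb the mid-block self-attention layers
    torch_dtype=torch.float16,
).to("cuda")

image = pipe(
    "an insect robot preparing a delicious meal",
    guidance_scale=7.0,
    pag_scale=3.0,  # strength of the perturbed-attention guidance term
).images[0]
image.save("pag_example.png")
```

The newly documented `StableDiffusionXLControlNetPAGImg2ImgPipeline` should follow the same pattern, with a ControlNet and an init image added to the call.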
diff --git a/docs/source/en/stable_diffusion.md b/docs/source/en/stable_diffusion.md
index db4953ebbffd..64d8f7df0b1f 100644
--- a/docs/source/en/stable_diffusion.md
+++ b/docs/source/en/stable_diffusion.md
@@ -238,7 +238,7 @@ Pretty impressive! Let's tweak the second image - corresponding to the `Generato
```python
prompts = [
"portrait photo of the oldest warrior chief, tribal panther make up, blue on red, side profile, looking away, serious eyes 50mm portrait photography, hard rim lighting photography--beta --ar 2:3 --beta --upbeta",
- "portrait photo of a old warrior chief, tribal panther make up, blue on red, side profile, looking away, serious eyes 50mm portrait photography, hard rim lighting photography--beta --ar 2:3 --beta --upbeta",
+ "portrait photo of an old warrior chief, tribal panther make up, blue on red, side profile, looking away, serious eyes 50mm portrait photography, hard rim lighting photography--beta --ar 2:3 --beta --upbeta",
"portrait photo of a warrior chief, tribal panther make up, blue on red, side profile, looking away, serious eyes 50mm portrait photography, hard rim lighting photography--beta --ar 2:3 --beta --upbeta",
"portrait photo of a young warrior chief, tribal panther make up, blue on red, side profile, looking away, serious eyes 50mm portrait photography, hard rim lighting photography--beta --ar 2:3 --beta --upbeta",
]
diff --git a/docs/source/ko/conceptual/contribution.md b/docs/source/ko/conceptual/contribution.md
index f6d77f1952bf..bcb17c820d0e 100644
--- a/docs/source/ko/conceptual/contribution.md
+++ b/docs/source/ko/conceptual/contribution.md
@@ -10,7 +10,7 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
specific language governing permissions and limitations under the License.
-->
-# Diffusers์ ๊ธฐ์ฌํ๋ ๋ฐฉ๋ฒ ๐งจ
+# Diffusers์ ๊ธฐ์ฌํ๋ ๋ฐฉ๋ฒ ๐งจ [[how-to-contribute-to-diffusers-]]
์คํ ์์ค ์ปค๋ฎค๋ํฐ์์์ ๊ธฐ์ฌ๋ฅผ ํ์ํฉ๋๋ค! ๋๊ตฌ๋ ์ฐธ์ฌํ ์ ์์ผ๋ฉฐ, ์ฝ๋๋ฟ๋ง ์๋๋ผ ์ง๋ฌธ์ ๋ต๋ณํ๊ฑฐ๋ ๋ฌธ์๋ฅผ ๊ฐ์ ํ๋ ๋ฑ ๋ชจ๋ ์ ํ์ ์ฐธ์ฌ๊ฐ ๊ฐ์น ์๊ณ ๊ฐ์ฌํ ์ฌ๊ฒจ์ง๋๋ค. ์ง๋ฌธ์ ๋ต๋ณํ๊ณ ๋ค๋ฅธ ์ฌ๋๋ค์ ๋์์ฃผ๋ฉฐ ์ํตํ๊ณ ๋ฌธ์๋ฅผ ๊ฐ์ ํ๋ ๊ฒ์ ๋ชจ๋ ์ปค๋ฎค๋ํฐ์๊ฒ ํฐ ๋์์ด ๋ฉ๋๋ค. ๋ฐ๋ผ์ ๊ด์ฌ์ด ์๋ค๋ฉด ๋๋ ค์ํ์ง ๋ง๊ณ ์ฐธ์ฌํด๋ณด์ธ์!
@@ -18,9 +18,9 @@ specific language governing permissions and limitations under the License.
์ด๋ค ๋ฐฉ์์ผ๋ก๋ ๊ธฐ์ฌํ๋ ค๋ ๊ฒฝ์ฐ, ์ฐ๋ฆฌ๋ ๊ฐ๋ฐฉ์ ์ด๊ณ ํ์ํ๋ฉฐ ์น๊ทผํ ์ปค๋ฎค๋ํฐ์ ์ผ๋ถ๊ฐ ๋๊ธฐ ์ํด ๋
ธ๋ ฅํ๊ณ ์์ต๋๋ค. ์ฐ๋ฆฌ์ [ํ๋ ๊ฐ๋ น](https://github.com/huggingface/diffusers/blob/main/CODE_OF_CONDUCT.md)์ ์ฝ๊ณ ์ํธ ์์ฉ ์ค์ ์ด๋ฅผ ์กด์คํ๋๋ก ์ฃผ์ํด์ฃผ์๊ธฐ ๋ฐ๋๋๋ค. ๋ํ ํ๋ก์ ํธ๋ฅผ ์๋ดํ๋ [์ค๋ฆฌ ์ง์นจ](https://huggingface.co/docs/diffusers/conceptual/ethical_guidelines)์ ์ต์ํด์ง๊ณ ๋์ผํ ํฌ๋ช
์ฑ๊ณผ ์ฑ
์์ฑ์ ์์น์ ์ค์ํด์ฃผ์๊ธฐ๋ฅผ ๋ถํ๋๋ฆฝ๋๋ค.
-์ฐ๋ฆฌ๋ ์ปค๋ฎค๋ํฐ๋ก๋ถํฐ์ ํผ๋๋ฐฑ์ ๋งค์ฐ ์ค์ํ๊ฒ ์๊ฐํ๋ฏ๋ก, ๋ผ์ด๋ธ๋ฌ๋ฆฌ๋ฅผ ๊ฐ์ ํ๋ ๋ฐ ๋์์ด ๋ ๊ฐ์น ์๋ ํผ๋๋ฐฑ์ด ์๋ค๊ณ ์๊ฐ๋๋ฉด ๋ง์ค์ด์ง ๋ง๊ณ ์๊ฒฌ์ ์ ์ํด์ฃผ์ธ์ - ๋ชจ๋ ๋ฉ์์ง, ๋๊ธ, ์ด์, ํ ๋ฆฌํ์คํธ(PR)๋ ์ฝํ๊ณ ๊ณ ๋ ค๋ฉ๋๋ค.
+์ฐ๋ฆฌ๋ ์ปค๋ฎค๋ํฐ๋ก๋ถํฐ์ ํผ๋๋ฐฑ์ ๋งค์ฐ ์ค์ํ๊ฒ ์๊ฐํ๋ฏ๋ก, ๋ผ์ด๋ธ๋ฌ๋ฆฌ๋ฅผ ๊ฐ์ ํ๋ ๋ฐ ๋์์ด ๋ ๊ฐ์น ์๋ ํผ๋๋ฐฑ์ด ์๋ค๊ณ ์๊ฐ๋๋ฉด ๋ง์ค์ด์ง ๋ง๊ณ ์๊ฒฌ์ ์ ์ํด์ฃผ์ธ์ - ๋ชจ๋ ๋ฉ์์ง, ๋๊ธ, ์ด์, Pull Request(PR)๋ ์ฝํ๊ณ ๊ณ ๋ ค๋ฉ๋๋ค.
-## ๊ฐ์
+## ๊ฐ์ [[overview]]
์ด์์ ์๋ ์ง๋ฌธ์ ๋ต๋ณํ๋ ๊ฒ์์๋ถํฐ ์ฝ์ด ๋ผ์ด๋ธ๋ฌ๋ฆฌ์ ์๋ก์ด diffusion ๋ชจ๋ธ์ ์ถ๊ฐํ๋ ๊ฒ๊น์ง ๋ค์ํ ๋ฐฉ๋ฒ์ผ๋ก ๊ธฐ์ฌ๋ฅผ ํ ์ ์์ต๋๋ค.
@@ -38,9 +38,9 @@ specific language governing permissions and limitations under the License.
์์ ๋งํ ๋๋ก, **๋ชจ๋ ๊ธฐ์ฌ๋ ์ปค๋ฎค๋ํฐ์๊ฒ ๊ฐ์น๊ฐ ์์ต๋๋ค**. ์ด์ด์ง๋ ๋ถ๋ถ์์ ๊ฐ ๊ธฐ์ฌ์ ๋ํด ์กฐ๊ธ ๋ ์์ธํ ์ค๋ช
ํ๊ฒ ์ต๋๋ค.
-4๋ถํฐ 9๊น์ง์ ๋ชจ๋ ๊ธฐ์ฌ์๋ PR์ ์ด์ด์ผ ํฉ๋๋ค. [PR์ ์ด๊ธฐ](#how-to-open-a-pr)์์ ์์ธํ ์ค๋ช
๋์ด ์์ต๋๋ค.
+4๋ถํฐ 9๊น์ง์ ๋ชจ๋ ๊ธฐ์ฌ์๋ Pull Request์ ์ด์ด์ผ ํฉ๋๋ค. [Pull Request ์ด๊ธฐ](#how-to-open-a-pr)์์ ์์ธํ ์ค๋ช
๋์ด ์์ต๋๋ค.
-### 1. Diffusers ํ ๋ก ํฌ๋ผ์ด๋ Diffusers Discord์์ ์ง๋ฌธํ๊ณ ๋ต๋ณํ๊ธฐ
+### 1. Diffusers ํ ๋ก ํฌ๋ผ์ด๋ Diffusers Discord์์ ์ง๋ฌธํ๊ณ ๋ต๋ณํ๊ธฐ [[1-asking-and-answering-questions-on-the-diffusers-discussion-forum-or-on-the-diffusers-discord]]
Diffusers ๋ผ์ด๋ธ๋ฌ๋ฆฌ์ ๊ด๋ จ๋ ๋ชจ๋ ์ง๋ฌธ์ด๋ ์๊ฒฌ์ [ํ ๋ก ํฌ๋ผ](https://discuss.huggingface.co/c/discussion-related-to-httpsgithubcomhuggingfacediffusers/63)์ด๋ [Discord](https://discord.gg/G7tWnz98XR)์์ ํ ์ ์์ต๋๋ค. ์ด๋ฌํ ์ง๋ฌธ๊ณผ ์๊ฒฌ์๋ ๋ค์๊ณผ ๊ฐ์ ๋ด์ฉ์ด ํฌํจ๋ฉ๋๋ค(ํ์ง๋ง ์ด์ ๊ตญํ๋์ง๋ ์์ต๋๋ค):
- ์ง์์ ๊ณต์ ํ๊ธฐ ์ํด์ ํ๋ จ ๋๋ ์ถ๋ก ์คํ์ ๋ํ ๊ฒฐ๊ณผ ๋ณด๊ณ
@@ -54,7 +54,7 @@ Diffusers ๋ผ์ด๋ธ๋ฌ๋ฆฌ์ ๊ด๋ จ๋ ๋ชจ๋ ์ง๋ฌธ์ด๋ ์๊ฒฌ์ [ํ ๋ก ํฌ
- Diffusion ๋ชจ๋ธ์ ๋ํ ์ค๋ฆฌ์ ์ง๋ฌธ
- ...
-ํฌ๋ผ์ด๋ Discord์์ ์ง๋ฌธ์ ํ๋ฉด ์ปค๋ฎค๋ํฐ๊ฐ ์ง์์ ๊ณต๊ฐ์ ์ผ๋ก ๊ณต์ ํ๋๋ก ์ฅ๋ ค๋๋ฉฐ, ๋ฏธ๋์ ๋์ผํ ์ง๋ฌธ์ ๊ฐ์ง ์ด๋ณด์์๊ฒ๋ ๋์์ด ๋ ์ ์์ต๋๋ค. ๋ฐ๋ผ์ ๊ถ๊ธํ ์ง๋ฌธ์ ์ธ์ ๋ ์ง ํ์๊ธฐ ๋ฐ๋๋๋ค.
+ํฌ๋ผ์ด๋ Discord์์ ์ง๋ฌธ์ ํ๋ฉด ์ปค๋ฎค๋ํฐ๊ฐ ์ง์์ ๊ณต๊ฐ์ ์ผ๋ก ๊ณต์ ํ๋๋ก ์ฅ๋ ค๋๋ฉฐ, ํฅํ ๋์ผํ ์ง๋ฌธ์ ๊ฐ์ง ์ด๋ณด์์๊ฒ๋ ๋์์ด ๋ ์ ์์ต๋๋ค. ๋ฐ๋ผ์ ๊ถ๊ธํ ์ง๋ฌธ์ ์ธ์ ๋ ์ง ํ์๊ธฐ ๋ฐ๋๋๋ค.
๋ํ, ์ด๋ฌํ ์ง๋ฌธ์ ๋ต๋ณํ๋ ๊ฒ์ ์ปค๋ฎค๋ํฐ์๊ฒ ๋งค์ฐ ํฐ ๋์์ด ๋ฉ๋๋ค. ์๋ํ๋ฉด ์ด๋ ๊ฒ ํ๋ฉด ๋ชจ๋๊ฐ ํ์ตํ ์ ์๋ ๊ณต๊ฐ์ ์ธ ์ง์์ ๋ฌธ์ํํ๊ธฐ ๋๋ฌธ์
๋๋ค.
**์ฃผ์**ํ์ญ์์ค. ์ง๋ฌธ์ด๋ ๋ต๋ณ์ ํฌ์ํ๋ ๋
ธ๋ ฅ์ด ๋ง์์๋ก ๊ณต๊ฐ์ ์ผ๋ก ๋ฌธ์ํ๋ ์ง์์ ํ์ง์ด ๋์์ง๋๋ค. ๋ง์ฐฌ๊ฐ์ง๋ก, ์ ์ ์๋๊ณ ์ ๋ต๋ณ๋ ์ง๋ฌธ์ ๋ชจ๋์๊ฒ ์ ๊ทผ ๊ฐ๋ฅํ ๊ณ ํ์ง ์ง์ ๋ฐ์ดํฐ๋ฒ ์ด์ค๋ฅผ ๋ง๋ค์ด์ค๋๋ค. ๋ฐ๋ฉด์ ์๋ชป๋ ์ง๋ฌธ์ด๋ ๋ต๋ณ์ ๊ณต๊ฐ ์ง์ ๋ฐ์ดํฐ๋ฒ ์ด์ค์ ์ ๋ฐ์ ์ธ ํ์ง์ ๋ฎ์ถ ์ ์์ต๋๋ค.
@@ -64,9 +64,9 @@ Diffusers ๋ผ์ด๋ธ๋ฌ๋ฆฌ์ ๊ด๋ จ๋ ๋ชจ๋ ์ง๋ฌธ์ด๋ ์๊ฒฌ์ [ํ ๋ก ํฌ
[*ํฌ๋ผ*](https://discuss.huggingface.co/c/discussion-related-to-httpsgithubcomhuggingfacediffusers/63)์ ๊ตฌ๊ธ๊ณผ ๊ฐ์ ๊ฒ์ ์์ง์์ ๋ ์ ์์ธํ๋ฉ๋๋ค. ๊ฒ์๋ฌผ์ ์ธ๊ธฐ์ ๋ฐ๋ผ ์์๊ฐ ๋งค๊ฒจ์ง๋ฉฐ, ์๊ฐ์์ผ๋ก ์ ๋ ฌ๋์ง ์์ต๋๋ค. ๋ฐ๋ผ์ ์ด์ ์ ๊ฒ์ํ ์ง๋ฌธ๊ณผ ๋ต๋ณ์ ์ฝ๊ฒ ์ฐพ์ ์ ์์ต๋๋ค.
๋ํ, ํฌ๋ผ์ ๊ฒ์๋ ์ง๋ฌธ๊ณผ ๋ต๋ณ์ ์ฝ๊ฒ ๋งํฌํ ์ ์์ต๋๋ค.
๋ฐ๋ฉด *Discord*๋ ์ฑํ
ํ์์ผ๋ก ๋์ด ์์ด ๋น ๋ฅธ ๋ํ๋ฅผ ์ ๋ํฉ๋๋ค.
-์ง๋ฌธ์ ๋ํ ๋ต๋ณ์ ๋น ๋ฅด๊ฒ ๋ฐ์ ์๋ ์๊ฒ ์ง๋ง, ์๊ฐ์ด ์ง๋๋ฉด ์ง๋ฌธ์ด ๋ ์ด์ ๋ณด์ด์ง ์์ต๋๋ค. ๋ํ, Discord์์ ์ด์ ์ ๊ฒ์๋ ์ ๋ณด๋ฅผ ์ฐพ๋ ๊ฒ์ ํจ์ฌ ์ด๋ ต์ต๋๋ค. ๋ฐ๋ผ์ ํฌ๋ผ์ ์ฌ์ฉํ์ฌ ๊ณ ํ์ง์ ์ง๋ฌธ๊ณผ ๋ต๋ณ์ ํ์ฌ ์ปค๋ฎค๋ํฐ๋ฅผ ์ํ ์ค๋ ์ง์๋๋ ์ง์์ ๋ง๋ค๊ธฐ๋ฅผ ๊ถ์ฅํฉ๋๋ค. Discord์์์ ํ ๋ก ์ด ๋งค์ฐ ํฅ๋ฏธ๋ก์ด ๋ต๋ณ๊ณผ ๊ฒฐ๋ก ์ ์ด๋์ด๋ด๋ ๊ฒฝ์ฐ, ํด๋น ์ ๋ณด๋ฅผ ํฌ๋ผ์ ๊ฒ์ํ์ฌ ๋ฏธ๋ ๋
์๋ค์๊ฒ ๋ ์ฝ๊ฒ ์ก์ธ์คํ ์ ์๋๋ก ๊ถ์ฅํฉ๋๋ค.
+์ง๋ฌธ์ ๋ํ ๋ต๋ณ์ ๋น ๋ฅด๊ฒ ๋ฐ์ ์๋ ์๊ฒ ์ง๋ง, ์๊ฐ์ด ์ง๋๋ฉด ์ง๋ฌธ์ด ๋ ์ด์ ๋ณด์ด์ง ์์ต๋๋ค. ๋ํ, Discord์์ ์ด์ ์ ๊ฒ์๋ ์ ๋ณด๋ฅผ ์ฐพ๋ ๊ฒ์ ํจ์ฌ ์ด๋ ต์ต๋๋ค. ๋ฐ๋ผ์ ํฌ๋ผ์ ์ฌ์ฉํ์ฌ ๊ณ ํ์ง์ ์ง๋ฌธ๊ณผ ๋ต๋ณ์ ํ์ฌ ์ปค๋ฎค๋ํฐ๋ฅผ ์ํ ์ค๋ ์ง์๋๋ ์ง์์ ๋ง๋ค๊ธฐ๋ฅผ ๊ถ์ฅํฉ๋๋ค. Discord์์์ ํ ๋ก ์ด ๋งค์ฐ ํฅ๋ฏธ๋ก์ด ๋ต๋ณ๊ณผ ๊ฒฐ๋ก ์ ์ด๋์ด๋ด๋ ๊ฒฝ์ฐ, ํด๋น ์ ๋ณด๋ฅผ ํฌ๋ผ์ ๊ฒ์ํ์ฌ ํฅํ ๋
์๋ค์๊ฒ ๋ ์ฝ๊ฒ ์ก์ธ์คํ ์ ์๋๋ก ๊ถ์ฅํฉ๋๋ค.
-### 2. GitHub ์ด์ ํญ์์ ์๋ก์ด ์ด์ ์ด๊ธฐ
+### 2. GitHub ์ด์ ํญ์์ ์๋ก์ด ์ด์ ์ด๊ธฐ [[2-opening-new-issues-on-the-github-issues-tab]]
๐งจ Diffusers ๋ผ์ด๋ธ๋ฌ๋ฆฌ๋ ์ฌ์ฉ์๋ค์ด ๋ง์ฃผ์น๋ ๋ฌธ์ ๋ฅผ ์๋ ค์ฃผ๋ ๋๋ถ์ ๊ฒฌ๊ณ ํ๊ณ ์ ๋ขฐํ ์ ์์ต๋๋ค. ๋ฐ๋ผ์ ์ด์๋ฅผ ๋ณด๊ณ ํด์ฃผ์
์ ๊ฐ์ฌํฉ๋๋ค.
@@ -81,53 +81,52 @@ Diffusers ๋ผ์ด๋ธ๋ฌ๋ฆฌ์ ๊ด๋ จ๋ ๋ชจ๋ ์ง๋ฌธ์ด๋ ์๊ฒฌ์ [ํ ๋ก ํฌ
- ์ด์๊ฐ ์ต์ Diffusers ๋ฒ์ ์ผ๋ก ์
๋ฐ์ดํธํ๋ฉด ํด๊ฒฐ๋ ์ ์๋์ง ํ์ธํด์ฃผ์ธ์. ์ด์๋ฅผ ๊ฒ์ํ๊ธฐ ์ ์ `python -c "import diffusers; print(diffusers.__version__)"` ๋ช
๋ น์ ์คํํ์ฌ ํ์ฌ ์ฌ์ฉ ์ค์ธ Diffusers ๋ฒ์ ์ด ์ต์ ๋ฒ์ ๊ณผ ์ผ์นํ๊ฑฐ๋ ๋ ๋์์ง ํ์ธํด์ฃผ์ธ์.
- ์๋ก์ด ์ด์๋ฅผ ์ด ๋ ํฌ์ํ๋ ๋
ธ๋ ฅ์ด ๋ง์์๋ก ๋ต๋ณ์ ํ์ง์ด ๋์์ง๊ณ Diffusers ์ด์ ์ ์ฒด์ ํ์ง๋ ํฅ์๋ฉ๋๋ค.
-#### 2.1 ์ฌํ๊ฐ๋ฅํ๊ณ ์ต์ํ์ธ ๋ฒ๊ทธ ๋ฆฌํฌํธ
+#### 2.1 ์ฌํ ๊ฐ๋ฅํ ์ต์ํ์ ๋ฒ๊ทธ ๋ฆฌํฌํธ [[21-reproducible-minimal-bug-reports]]
-์๋ก์ด ์ด์๋ ์ผ๋ฐ์ ์ผ๋ก ๋ค์๊ณผ ๊ฐ์ ๋ด์ฉ์ ํฌํจํฉ๋๋ค.
-๋ฒ๊ทธ ๋ณด๊ณ ์๋ ํญ์ ์ฌํ ๊ฐ๋ฅํ ์ฝ๋ ์กฐ๊ฐ์ ํฌํจํ๊ณ ๊ฐ๋ฅํ ํ ์ต์ํ์ด์ด์ผ ํ๋ฉฐ ๊ฐ๊ฒฐํด์ผ ํฉ๋๋ค.
+๋ฒ๊ทธ ๋ฆฌํฌํธ๋ ํญ์ ์ฌํ ๊ฐ๋ฅํ ์ฝ๋ ์กฐ๊ฐ์ ํฌํจํ๊ณ ๊ฐ๋ฅํ ํ ์ต์ํ์ด์ด์ผ ํ๋ฉฐ ๊ฐ๊ฒฐํด์ผ ํฉ๋๋ค.
์์ธํ ๋งํ๋ฉด:
- ๋ฒ๊ทธ๋ฅผ ๊ฐ๋ฅํ ํ ์ขํ์ผ ํฉ๋๋ค. **์ ์ฒด ์ฝ๋ ํ์ผ์ ๊ทธ๋ฅ ๋์ง์ง ๋ง์ธ์**.
- ์ฝ๋์ ์์์ ์ง์ ํด์ผ ํฉ๋๋ค.
- Diffusers๊ฐ ์์กดํ๋ ์ธ๋ถ ๋ผ์ด๋ธ๋ฌ๋ฆฌ๋ฅผ ์ ์ธํ ๋ค๋ฅธ ์ธ๋ถ ๋ผ์ด๋ธ๋ฌ๋ฆฌ๋ ํฌํจํ์ง ๋ง์ญ์์ค.
-- **๋ฐ๋์** ํ๊ฒฝ์ ๋ํ ๋ชจ๋ ํ์ํ ์ ๋ณด๋ฅผ ์ ๊ณตํด์ผ ํฉ๋๋ค. ์ด๋ฅผ ์ํด ์์์ `diffusers-cli env`๋ฅผ ์คํํ๊ณ ํ์๋ ์ ๋ณด๋ฅผ ์ด์์ ๋ณต์ฌํ์ฌ ๋ถ์ฌ๋ฃ์ ์ ์์ต๋๋ค.
-- ์ด์๋ฅผ ์ค๋ช
ํด์ผ ํฉ๋๋ค. ๋
์๊ฐ ๋ฌธ์ ๊ฐ ๋ฌด์์ด๋ฉฐ ์ ๋ฌธ์ ์ธ์ง ๋ชจ๋ฅด๋ฉด ํด๊ฒฐํ ์ ์์ต๋๋ค.
-- **ํญ์** ๋
์๊ฐ ๊ฐ๋ฅํ ํ ์ ์ ๋
ธ๋ ฅ์ผ๋ก ๋ฌธ์ ๋ฅผ ์ฌํํ ์ ์๋๋ก ํด์ผ ํฉ๋๋ค. ์ฝ๋ ์กฐ๊ฐ์ด ๋ผ์ด๋ธ๋ฌ๋ฆฌ๊ฐ ์๊ฑฐ๋ ์ ์๋์ง ์์ ๋ณ์ ๋๋ฌธ์ ์คํ๋์ง ์๋ ๊ฒฝ์ฐ ๋
์๊ฐ ๋์์ ์ค ์ ์์ต๋๋ค. ์ฌํ ๊ฐ๋ฅํ ์ฝ๋ ์กฐ๊ฐ์ด ๊ฐ๋ฅํ ํ ์ต์ํ๋๊ณ ๊ฐ๋จํ Python ์
ธ์ ๋ณต์ฌํ์ฌ ๋ถ์ฌ๋ฃ์ ์ ์๋๋ก ํด์ผ ํฉ๋๋ค.
+- **ํญ์** ์ฌ์ฉ์ ํ๊ฒฝ์ ๋ํ ๋ชจ๋ ํ์ํ ์ ๋ณด๋ฅผ ์ ๊ณตํ์ธ์. ์ด๋ฅผ ์ํด ์์์ `diffusers-cli env`๋ฅผ ์คํํ๊ณ ํ์๋ ์ ๋ณด๋ฅผ ์ด์์ ๋ณต์ฌํ์ฌ ๋ถ์ฌ๋ฃ์ ์ ์์ต๋๋ค.
+- ์ด์๋ฅผ ์ค๋ช
ํด์ผ ํฉ๋๋ค. ๋
์๊ฐ ๋ฌธ์ ๊ฐ ๋ฌด์์ธ์ง, ์ ๋ฌธ์ ๊ฐ ๋๋์ง ๋ชจ๋ฅธ๋ค๋ฉด ์ด์๋ฅผ ํด๊ฒฐํ ์ ์์ต๋๋ค.
+- **ํญ์** ๋
์๊ฐ ๊ฐ๋ฅํ ํ ์ ์ ๋
ธ๋ ฅ์ผ๋ก ๋ฌธ์ ๋ฅผ ์ฌํํ ์ ์์ด์ผ ํฉ๋๋ค. ์ฝ๋ ์กฐ๊ฐ์ด ๋ผ์ด๋ธ๋ฌ๋ฆฌ๊ฐ ์๊ฑฐ๋ ์ ์๋์ง ์์ ๋ณ์ ๋๋ฌธ์ ์คํ๋์ง ์๋ ๊ฒฝ์ฐ ๋
์๊ฐ ๋์์ ์ค ์ ์์ต๋๋ค. ์ฌํ ๊ฐ๋ฅํ ์ฝ๋ ์กฐ๊ฐ์ด ๊ฐ๋ฅํ ํ ์ต์ํ๋๊ณ ๊ฐ๋จํ Python ์
ธ์ ๋ณต์ฌํ์ฌ ๋ถ์ฌ๋ฃ์ ์ ์๋๋ก ํด์ผ ํฉ๋๋ค.
- ๋ฌธ์ ๋ฅผ ์ฌํํ๊ธฐ ์ํด ๋ชจ๋ธ๊ณผ/๋๋ ๋ฐ์ดํฐ์
์ด ํ์ํ ๊ฒฝ์ฐ ๋
์๊ฐ ํด๋น ๋ชจ๋ธ์ด๋ ๋ฐ์ดํฐ์
์ ์ ๊ทผํ ์ ์๋๋ก ํด์ผ ํฉ๋๋ค. ๋ชจ๋ธ์ด๋ ๋ฐ์ดํฐ์
์ [Hub](https://huggingface.co)์ ์
๋ก๋ํ์ฌ ์ฝ๊ฒ ๋ค์ด๋ก๋ํ ์ ์๋๋ก ํ ์ ์์ต๋๋ค. ๋ฌธ์ ์ฌํ์ ๊ฐ๋ฅํ ํ ์ฝ๊ฒํ๊ธฐ ์ํด ๋ชจ๋ธ๊ณผ ๋ฐ์ดํฐ์
์ ๊ฐ๋ฅํ ํ ์๊ฒ ์ ์งํ๋ ค๊ณ ๋
ธ๋ ฅํ์ธ์.
์์ธํ ๋ด์ฉ์ [์ข์ ์ด์ ์์ฑ ๋ฐฉ๋ฒ](#how-to-write-a-good-issue) ์น์
์ ์ฐธ์กฐํ์ธ์.
-๋ฒ๊ทธ ๋ณด๊ณ ์๋ฅผ ์ด๋ ค๋ฉด [์ฌ๊ธฐ](https://github.com/huggingface/diffusers/issues/new?assignees=&labels=bug&projects=&template=bug-report.yml)๋ฅผ ํด๋ฆญํ์ธ์.
+๋ฒ๊ทธ ๋ฆฌํฌํธ๋ฅผ ์ด๋ ค๋ฉด [์ฌ๊ธฐ](https://github.com/huggingface/diffusers/issues/new?assignees=&labels=bug&projects=&template=bug-report.yml)๋ฅผ ํด๋ฆญํ์ธ์.
-#### 2.2. ๊ธฐ๋ฅ ์์ฒญ
+#### 2.2. ๊ธฐ๋ฅ ์์ฒญ [[22-feature-requests]]
์ธ๊ณ์ ์ธ ๊ธฐ๋ฅ ์์ฒญ์ ๋ค์ ์ฌํญ์ ๋ค๋ฃน๋๋ค:
1. ๋จผ์ ๋๊ธฐ๋ถ์ฌ:
-* ๋ผ์ด๋ธ๋ฌ๋ฆฌ์ ๊ด๋ จ๋ ๋ฌธ์ /๋ถ๋ง์ด ์๋๊ฐ์? ๊ทธ๋ ๋ค๋ฉด ์ ๊ทธ๋ฐ์ง ์ค๋ช
ํด์ฃผ์ธ์. ๋ฌธ์ ๋ฅผ ๋ณด์ฌ์ฃผ๋ ์ฝ๋ ์กฐ๊ฐ์ ์ ๊ณตํ๋ ๊ฒ์ด ๊ฐ์ฅ ์ข์ต๋๋ค.
+* ๋ผ์ด๋ธ๋ฌ๋ฆฌ์ ๊ด๋ จ๋ ๋ฌธ์ /๋ถ๋ง์ด ์๋์? ๊ทธ๋ ๋ค๋ฉด ์ ๊ทธ๋ฐ์ง ์ค๋ช
ํด์ฃผ์ธ์. ๋ฌธ์ ๋ฅผ ๋ณด์ฌ์ฃผ๋ ์ฝ๋ ์กฐ๊ฐ์ ์ ๊ณตํ๋ ๊ฒ์ด ๊ฐ์ฅ ์ข์ต๋๋ค.
* ํ๋ก์ ํธ์ ํ์ํ ๊ธฐ๋ฅ์ธ๊ฐ์? ์ฐ๋ฆฌ๋ ๊ทธ์ ๋ํด ๋ฃ๊ณ ์ถ์ต๋๋ค!
* ์ปค๋ฎค๋ํฐ์ ๋์์ด ๋ ์ ์๋ ๊ฒ์ ์์
ํ๊ณ ๊ทธ๊ฒ์ ๋ํด ์๊ฐํ๊ณ ์๋๊ฐ์? ๋ฉ์ง๋ค์! ์ด๋ค ๋ฌธ์ ๋ฅผ ํด๊ฒฐํ๋์ง ์๋ ค์ฃผ์ธ์.
2. ๊ธฐ๋ฅ์ *์์ธํ ์ค๋ช
ํ๋* ๋ฌธ๋จ์ ์์ฑํด์ฃผ์ธ์;
-3. ๋ฏธ๋ ์ฌ์ฉ์ ๋ณด์ฌ์ฃผ๋ **์ฝ๋ ์กฐ๊ฐ**์ ์ ๊ณตํด์ฃผ์ธ์;
-4. ์ด๊ฒ์ด ๋
ผ๋ฌธ๊ณผ ๊ด๋ จ๋ ๊ฒฝ์ฐ ๋งํฌ๋ฅผ ์ฒจ๋ถํด์ฃผ์ธ์;
-5. ๋์์ด ๋ ์ ์๋ ์ถ๊ฐ ์ ๋ณด(๊ทธ๋ฆผ, ์คํฌ๋ฆฐ์ท ๋ฑ)๋ฅผ ์ฒจ๋ถํด์ฃผ์ธ์.
+3. ํฅํ ์ฌ์ฉ์ ๋ณด์ฌ์ฃผ๋ **์ฝ๋ ์กฐ๊ฐ**์ ์ ๊ณตํด์ฃผ์ธ์;
+4. ๋
ผ๋ฌธ๊ณผ ๊ด๋ จ๋ ๋ด์ฉ์ธ ๊ฒฝ์ฐ ๋งํฌ๋ฅผ ์ฒจ๋ถํด์ฃผ์ธ์;
+5. ๋์์ด ๋ ์ ์๋ค๊ณ ์๊ฐ๋๋ ์ถ๊ฐ ์ ๋ณด(๊ทธ๋ฆผ, ์คํฌ๋ฆฐ์ท ๋ฑ)๋ฅผ ์ฒจ๋ถํด์ฃผ์ธ์.
๊ธฐ๋ฅ ์์ฒญ์ [์ฌ๊ธฐ](https://github.com/huggingface/diffusers/issues/new?assignees=&labels=&template=feature_request.md&title=)์์ ์ด ์ ์์ต๋๋ค.
-#### 2.3 ํผ๋๋ฐฑ
+#### 2.3 ํผ๋๋ฐฑ [[23-feedback]]
-๋ผ์ด๋ธ๋ฌ๋ฆฌ ๋์์ธ๊ณผ ๊ทธ๊ฒ์ด ์ ์ข์์ง ๋๋ ๋์์ง์ ๋ํ ์ด์ ์ ๋ํ ํผ๋๋ฐฑ์ ํต์ฌ ๋ฉ์ธํ
์ด๋๊ฐ ์ฌ์ฉ์ ์นํ์ ์ธ ๋ผ์ด๋ธ๋ฌ๋ฆฌ๋ฅผ ๋ง๋๋ ๋ฐ ์์ฒญ๋ ๋์์ด ๋ฉ๋๋ค. ํ์ฌ ๋์์ธ ์ฒ ํ์ ์ดํดํ๋ ค๋ฉด [์ฌ๊ธฐ](https://huggingface.co/docs/diffusers/conceptual/philosophy)๋ฅผ ์ฐธ์กฐํด ์ฃผ์ธ์. ํน์ ๋์์ธ ์ ํ์ด ํ์ฌ ๋์์ธ ์ฒ ํ๊ณผ ๋ง์ง ์๋๋ค๊ณ ์๊ฐ๋๋ฉด, ๊ทธ ์ด์ ์ ์ด๋ป๊ฒ ๋ณ๊ฒฝ๋์ด์ผ ํ๋์ง ์ค๋ช
ํด ์ฃผ์ธ์. ๋ฐ๋๋ก ํน์ ๋์์ธ ์ ํ์ด ๋์์ธ ์ฒ ํ์ ๋๋ฌด ๋ฐ๋ฅด๊ธฐ ๋๋ฌธ์ ์ฌ์ฉ ์ฌ๋ก๋ฅผ ์ ํํ๋ค๊ณ ์๊ฐ๋๋ฉด, ๊ทธ ์ด์ ์ ์ด๋ป๊ฒ ๋ณ๊ฒฝ๋์ด์ผ ํ๋์ง ์ค๋ช
ํด ์ฃผ์ธ์. ํน์ ๋์์ธ ์ ํ์ด ๋งค์ฐ ์ ์ฉํ๋ค๊ณ ์๊ฐ๋๋ฉด, ๋ฏธ๋์ ๋์์ธ ๊ฒฐ์ ์ ํฐ ๋์์ด ๋๋ฏ๋ก ์ด์ ๋ํ ์๊ฒฌ์ ๋จ๊ฒจ ์ฃผ์ธ์.
+๋ผ์ด๋ธ๋ฌ๋ฆฌ ๋์์ธ๊ณผ ๊ทธ๊ฒ์ด ์ ์ข์์ง ๋๋ ๋์์ง์ ๋ํ ์ด์ ์ ๋ํ ํผ๋๋ฐฑ์ ํต์ฌ ๋ฉ์ธํ
์ด๋๊ฐ ์ฌ์ฉ์ ์นํ์ ์ธ ๋ผ์ด๋ธ๋ฌ๋ฆฌ๋ฅผ ๋ง๋๋ ๋ฐ ์์ฒญ๋ ๋์์ด ๋ฉ๋๋ค. ํ์ฌ ๋์์ธ ์ฒ ํ์ ์ดํดํ๋ ค๋ฉด [์ฌ๊ธฐ](https://huggingface.co/docs/diffusers/conceptual/philosophy)๋ฅผ ์ฐธ์กฐํด ์ฃผ์ธ์. ํน์ ๋์์ธ ์ ํ์ด ํ์ฌ ๋์์ธ ์ฒ ํ๊ณผ ๋ง์ง ์๋๋ค๊ณ ์๊ฐ๋๋ฉด, ๊ทธ ์ด์ ์ ์ด๋ป๊ฒ ๋ณ๊ฒฝ๋์ด์ผ ํ๋์ง ์ค๋ช
ํด ์ฃผ์ธ์. ๋ฐ๋๋ก ํน์ ๋์์ธ ์ ํ์ด ๋์์ธ ์ฒ ํ์ ๋๋ฌด ๋ฐ๋ฅด๊ธฐ ๋๋ฌธ์ ์ฌ์ฉ ์ฌ๋ก๋ฅผ ์ ํํ๋ค๊ณ ์๊ฐ๋๋ฉด, ๊ทธ ์ด์ ์ ์ด๋ป๊ฒ ๋ณ๊ฒฝ๋์ด์ผ ํ๋์ง ์ค๋ช
ํด ์ฃผ์ธ์. ํน์ ๋์์ธ ์ ํ์ด ๋งค์ฐ ์ ์ฉํ๋ค๊ณ ์๊ฐ๋๋ฉด, ํฅํ ๋์์ธ ๊ฒฐ์ ์ ํฐ ๋์์ด ๋๋ฏ๋ก ์ด์ ๋ํ ์๊ฒฌ์ ๋จ๊ฒจ ์ฃผ์ธ์.
ํผ๋๋ฐฑ์ ๊ดํ ์ด์๋ [์ฌ๊ธฐ](https://github.com/huggingface/diffusers/issues/new?assignees=&labels=&template=feedback.md&title=)์์ ์ด ์ ์์ต๋๋ค.
-#### 2.4 ๊ธฐ์ ์ ์ธ ์ง๋ฌธ
+#### 2.4 ๊ธฐ์ ์ ์ธ ์ง๋ฌธ [[24-technical-questions]]
๊ธฐ์ ์ ์ธ ์ง๋ฌธ์ ์ฃผ๋ก ๋ผ์ด๋ธ๋ฌ๋ฆฌ์ ํน์ ์ฝ๋๊ฐ ์ ํน์ ๋ฐฉ์์ผ๋ก ์์ฑ๋์๋์ง ๋๋ ์ฝ๋์ ํน์ ๋ถ๋ถ์ด ๋ฌด์์ ํ๋์ง์ ๋ํ ์ง๋ฌธ์
๋๋ค. ์ง๋ฌธํ์ ์ฝ๋ ๋ถ๋ถ์ ๋ํ ๋งํฌ๋ฅผ ์ ๊ณตํ๊ณ ํด๋น ์ฝ๋ ๋ถ๋ถ์ด ์ดํดํ๊ธฐ ์ด๋ ค์ด ์ด์ ์ ๋ํ ์์ธํ ์ค๋ช
์ ํด์ฃผ์๊ธฐ ๋ฐ๋๋๋ค.
๊ธฐ์ ์ ์ธ ์ง๋ฌธ์ ๊ดํ ์ด์๋ฅผ [์ฌ๊ธฐ](https://github.com/huggingface/diffusers/issues/new?assignees=&labels=bug&template=bug-report.yml)์์ ์ด ์ ์์ต๋๋ค.
-#### 2.5 ์๋ก์ด ๋ชจ๋ธ, ์ค์ผ์ค๋ฌ ๋๋ ํ์ดํ๋ผ์ธ ์ถ๊ฐ ์ ์
+#### 2.5 ์๋ก์ด ๋ชจ๋ธ, ์ค์ผ์ค๋ฌ ๋๋ ํ์ดํ๋ผ์ธ ์ถ๊ฐ ์ ์ [[25-proposal-to-add-a-new-model-scheduler-or-pipeline]]
๋ง์ฝ diffusion ๋ชจ๋ธ ์ปค๋ฎค๋ํฐ์์ Diffusers ๋ผ์ด๋ธ๋ฌ๋ฆฌ์ ์ถ๊ฐํ๊ณ ์ถ์ ์๋ก์ด ๋ชจ๋ธ, ํ์ดํ๋ผ์ธ ๋๋ ์ค์ผ์ค๋ฌ๊ฐ ์๋ค๋ฉด, ๋ค์ ์ ๋ณด๋ฅผ ์ ๊ณตํด์ฃผ์ธ์:
@@ -135,34 +134,34 @@ Diffusers ๋ผ์ด๋ธ๋ฌ๋ฆฌ์ ๊ด๋ จ๋ ๋ชจ๋ ์ง๋ฌธ์ด๋ ์๊ฒฌ์ [ํ ๋ก ํฌ
* ํด๋น ๋ชจ๋ธ์ ์คํ ์์ค ๊ตฌํ์ ๋ํ ๋งํฌ
* ๋ชจ๋ธ ๊ฐ์ค์น๊ฐ ์๋ ๊ฒฝ์ฐ, ๊ฐ์ค์น์ ๋งํฌ
-๋ชจ๋ธ์ ์ง์ ๊ธฐ์ฌํ๊ณ ์ ํ๋ ๊ฒฝ์ฐ, ์ต์ ์ ์๋ด๋ฅผ ์ํด ์ฐ๋ฆฌ์๊ฒ ์๋ ค์ฃผ์ธ์. ๋ํ, ๊ฐ๋ฅํ๋ค๋ฉด ๊ตฌ์ฑ ์์(๋ชจ๋ธ, ์ค์ผ์ค๋ฌ, ํ์ดํ๋ผ์ธ ๋ฑ)์ ์๋ ์ ์๋ฅผ GitHub ํธ๋ค๋ก ํ๊ทธํ๋ ๊ฒ์ ์์ง ๋ง์ธ์.
+์ง์ ๋ชจ๋ธ์ ๊ธฐ์ฌํ๊ณ ์ถ๋ค๋ฉด, ๊ฐ์ฅ ์ ์๋ดํด๋๋ฆด ์ ์์ต๋๋ค. ๋ํ, ๊ฐ๋ฅํ๋ค๋ฉด ๊ตฌ์ฑ ์์(๋ชจ๋ธ, ์ค์ผ์ค๋ฌ, ํ์ดํ๋ผ์ธ ๋ฑ)์ ์์ ์๋ฅผ GitHub ํธ๋ค๋ก ํ๊ทธํ๋ ๊ฒ์ ์์ง ๋ง์ธ์.
๋ชจ๋ธ/ํ์ดํ๋ผ์ธ/์ค์ผ์ค๋ฌ์ ๋ํ ์์ฒญ์ [์ฌ๊ธฐ](https://github.com/huggingface/diffusers/issues/new?assignees=&labels=New+model%2Fpipeline%2Fscheduler&template=new-model-addition.yml)์์ ์ด ์ ์์ต๋๋ค.
-### 3. GitHub ์ด์ ํญ์์ ๋ฌธ์ ์ ๋ํ ๋ต๋ณํ๊ธฐ
+### 3. GitHub ์ด์ ํญ์์ ๋ฌธ์ ์ ๋ํ ๋ต๋ณํ๊ธฐ [[3-answering-issues-on-the-github-issues-tab]]
GitHub์์ ์ด์์ ๋ํ ๋ต๋ณ์ ํ๊ธฐ ์ํด์๋ Diffusers์ ๋ํ ๊ธฐ์ ์ ์ธ ์ง์์ด ํ์ํ ์ ์์ง๋ง, ์ ํํ ๋ต๋ณ์ด ์๋๋๋ผ๋ ๋ชจ๋๊ฐ ์๋ํด๊ธฐ๋ฅผ ๊ถ์ฅํฉ๋๋ค. ์ด์์ ๋ํ ๊ณ ํ์ง ๋ต๋ณ์ ์ ๊ณตํ๊ธฐ ์ํ ๋ช ๊ฐ์ง ํ:
- ๊ฐ๋ฅํ ํ ๊ฐ๊ฒฐํ๊ณ ์ต์ํ์ผ๋ก ์ ์งํฉ๋๋ค.
- ์ฃผ์ ์ ์ง์คํฉ๋๋ค. ์ด์์ ๋ํ ๋ต๋ณ์ ํด๋น ์ด์์ ๊ด๋ จ๋ ๋ด์ฉ์๋ง ์ง์คํด์ผ ํฉ๋๋ค.
-- ์ฝ๋, ๋
ผ๋ฌธ ๋๋ ๋ค๋ฅธ ์์ค๋ฅผ ์ ๊ณตํ์ฌ ๋ต๋ณ์ ์ฆ๋ช
ํ๊ฑฐ๋ ์ง์งํฉ๋๋ค.
+- ์์ ์ ์ฃผ์ฅ์ ์ฆ๋ช
ํ๊ฑฐ๋ ์ฅ๋ คํ๋ ์ฝ๋, ๋
ผ๋ฌธ ๋๋ ๊ธฐํ ์ถ์ฒ๋ ๋งํฌ๋ฅผ ์ ๊ณตํ์ธ์.
- ์ฝ๋๋ก ๋ต๋ณํฉ๋๋ค. ๊ฐ๋จํ ์ฝ๋ ์กฐ๊ฐ์ด ์ด์์ ๋ํ ๋ต๋ณ์ด๊ฑฐ๋ ์ด์๋ฅผ ํด๊ฒฐํ๋ ๋ฐฉ๋ฒ์ ๋ณด์ฌ์ค๋ค๋ฉด, ์์ ํ ์ฌํ ๊ฐ๋ฅํ ์ฝ๋ ์กฐ๊ฐ์ ์ ๊ณตํด์ฃผ์ธ์.
๋ํ, ๋ง์ ์ด์๋ค์ ๋จ์ํ ์ฃผ์ ์ ๋ฌด๊ดํ๊ฑฐ๋ ๋ค๋ฅธ ์ด์์ ์ค๋ณต์ด๊ฑฐ๋ ๊ด๋ จ์ด ์๋ ๊ฒฝ์ฐ๊ฐ ๋ง์ต๋๋ค. ์ด๋ฌํ ์ด์๋ค์ ๋ํ ๋ต๋ณ์ ์ ๊ณตํ๊ณ , ์ด์ ์์ฑ์์๊ฒ ๋ ์ ํํ ์ ๋ณด๋ฅผ ์ ๊ณตํ๊ฑฐ๋, ์ค๋ณต๋ ์ด์์ ๋ํ ๋งํฌ๋ฅผ ์ ๊ณตํ๊ฑฐ๋, [ํฌ๋ผ](https://discuss.huggingface.co/c/discussion-related-to-httpsgithubcomhuggingfacediffusers/63) ์ด๋ [Discord](https://discord.gg/G7tWnz98XR)๋ก ๋ฆฌ๋๋ ์
ํ๋ ๊ฒ์ ๋ฉ์ธํ
์ด๋์๊ฒ ํฐ ๋์์ด ๋ฉ๋๋ค.
์ด์๊ฐ ์ฌ๋ฐ๋ฅธ ๋ฒ๊ทธ ๋ณด๊ณ ์์ด๊ณ ์์ค ์ฝ๋์์ ์์ ์ด ํ์ํ๋ค๊ณ ํ์ธํ ๊ฒฝ์ฐ, ๋ค์ ์น์
์ ์ดํด๋ณด์ธ์.
-๋ค์ ๋ชจ๋ ๊ธฐ์ฌ์ ๋ํด์๋ PR์ ์ด์ฌ์ผ ํฉ๋๋ค. [PR ์ด๊ธฐ](#how-to-open-a-pr) ์น์
์์ ์์ธํ ์ค๋ช
๋์ด ์์ต๋๋ค.
+๋ค์ ๋ชจ๋ ๊ธฐ์ฌ์ ๋ํด์๋ PR์ ์ด์ฌ์ผ ํฉ๋๋ค. [Pull Request ์ด๊ธฐ](#how-to-open-a-pr) ์น์
์์ ์์ธํ ์ค๋ช
๋์ด ์์ต๋๋ค.
-### 4. "Good first issue" ๊ณ ์น๊ธฐ
+### 4. "Good first issue" ๊ณ ์น๊ธฐ [[4-fixing-a-good-first-issue]]
*Good first issues*๋ [Good first issue](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22) ๋ผ๋ฒจ๋ก ํ์๋ฉ๋๋ค. ์ผ๋ฐ์ ์ผ๋ก, ์ด์๋ ์ด๋ฏธ ์ ์ฌ์ ์ธ ํด๊ฒฐ์ฑ
์ด ์ด๋ป๊ฒ ๋ณด์ด๋์ง ์ค๋ช
ํ๊ณ ์์ด์ ์์ ํ๊ธฐ ์ฝ์ต๋๋ค.
๋ง์ฝ ์ด์๊ฐ ์์ง ๋ซํ์ง ์์๊ณ ์ด ๋ฌธ์ ๋ฅผ ํด๊ฒฐํด๋ณด๊ณ ์ถ๋ค๋ฉด, "์ด ์ด์๋ฅผ ํด๊ฒฐํด๋ณด๊ณ ์ถ์ต๋๋ค."๋ผ๋ ๋ฉ์์ง๋ฅผ ๋จ๊ธฐ๋ฉด ๋ฉ๋๋ค. ์ผ๋ฐ์ ์ผ๋ก ์ธ ๊ฐ์ง ์๋๋ฆฌ์ค๊ฐ ์์ต๋๋ค:
-- a.) ์ด์ ์ค๋ช
์ด ์ด๋ฏธ ํด๊ฒฐ์ฑ
์ ์ ์ํฉ๋๋ค. ์ด ๊ฒฝ์ฐ, ํด๊ฒฐ์ฑ
์ด ์ดํด๋๊ณ ํฉ๋ฆฌ์ ์ผ๋ก ๋ณด์ธ๋ค๋ฉด, PR ๋๋ ๋๋ํํธ PR์ ์ด์ด์ ์์ ํ ์ ์์ต๋๋ค.
-- b.) ์ด์ ์ค๋ช
์ด ํด๊ฒฐ์ฑ
์ ์ ์ํ์ง ์์ต๋๋ค. ์ด ๊ฒฝ์ฐ, ์ด๋ค ํด๊ฒฐ์ฑ
์ด ๊ฐ๋ฅํ ์ง ๋ฌผ์ด๋ณผ ์ ์๊ณ , Diffusers ํ์ ๋๊ตฐ๊ฐ๊ฐ ๊ณง ๋ต๋ณํด์ค ๊ฒ์
๋๋ค. ๋ง์ฝ ์ด๋ป๊ฒ ์์ ํ ์ง ์ข์ ์์ด๋์ด๊ฐ ์๋ค๋ฉด, ์ง์ PR์ ์ด์ด๋ ๋ฉ๋๋ค.
+- a.) ์ด์ ์ค๋ช
์ ์ด๋ฏธ ์์ ์ฌํญ์ ์ ์ํ๋ ๊ฒฝ์ฐ, ํด๊ฒฐ์ฑ
์ด ์ดํด๋๊ณ ํฉ๋ฆฌ์ ์ผ๋ก ๋ณด์ธ๋ค๋ฉด, PR ๋๋ ๋๋ํํธ PR์ ์ด์ด์ ์์ ํ ์ ์์ต๋๋ค.
+- b.) ์ด์ ์ค๋ช
์ ์์ ์ฌํญ์ด ์ ์๋์ด ์์ง ์์ ๊ฒฝ์ฐ, ์ ์ํ ์์ ์ฌํญ์ด ๊ฐ๋ฅํ ์ง ๋ฌผ์ด๋ณผ ์ ์๊ณ , Diffusers ํ์ ๋๊ตฐ๊ฐ๊ฐ ๊ณง ๋ต๋ณํด์ค ๊ฒ์
๋๋ค. ๋ง์ฝ ์ด๋ป๊ฒ ์์ ํ ์ง ์ข์ ์์ด๋์ด๊ฐ ์๋ค๋ฉด, ์ง์ PR์ ์ด์ด๋ ๋ฉ๋๋ค.
- c.) ์ด๋ฏธ ์ด ๋ฌธ์ ๋ฅผ ํด๊ฒฐํ๊ธฐ ์ํด ์ด๋ฆฐ PR์ด ์์ง๋ง, ์ด์๊ฐ ์์ง ๋ซํ์ง ์์์ต๋๋ค. PR์ด ๋ ์ด์ ์งํ๋์ง ์์๋ค๋ฉด, ์๋ก์ด PR์ ์ด๊ณ ์ด์ PR์ ๋งํฌ๋ฅผ ๊ฑธ๋ฉด ๋ฉ๋๋ค. PR์ ์ข
์ข
์๋ ๊ธฐ์ฌ์๊ฐ ๊ฐ์๊ธฐ ์๊ฐ์ ๋ด์ง ๋ชปํด ๋ ์ด์ ์งํํ์ง ๋ชปํ๋ ๊ฒฝ์ฐ์ ๋ ์ด์ ์งํ๋์ง ์๊ฒ ๋ฉ๋๋ค. ์ด๋ ์คํ ์์ค์์ ์์ฃผ ๋ฐ์ํ๋ ์ผ์ด๋ฉฐ ๋งค์ฐ ์ ์์ ์ธ ์ํฉ์
๋๋ค. ์ด ๊ฒฝ์ฐ, ์ปค๋ฎค๋ํฐ๋ ์๋ก ์๋ํ๊ณ ๊ธฐ์กด PR์ ์ง์์ ํ์ฉํด์ฃผ๋ฉด ๋งค์ฐ ๊ธฐ์ ๊ฒ์
๋๋ค. ์ด๋ฏธ PR์ด ์๊ณ ํ์ฑํ๋์ด ์๋ค๋ฉด, ์ ์์ ํด์ฃผ๊ฑฐ๋ PR์ ๊ฒํ ํ๊ฑฐ๋ PR์ ๊ธฐ์ฌํ ์ ์๋์ง ๋ฌผ์ด๋ณด๋ ๋ฑ ์์ฑ์๋ฅผ ๋์์ค ์ ์์ต๋๋ค.
-### 5. ๋ฌธ์์ ๊ธฐ์ฌํ๊ธฐ
+### 5. ๋ฌธ์์ ๊ธฐ์ฌํ๊ธฐ [[5-contribute-to-the-documentation]]
์ข์ ๋ผ์ด๋ธ๋ฌ๋ฆฌ๋ ํญ์ ์ข์ ๋ฌธ์๋ฅผ ๊ฐ๊ณ ์์ต๋๋ค! ๊ณต์ ๋ฌธ์๋ ๋ผ์ด๋ธ๋ฌ๋ฆฌ๋ฅผ ์ฒ์ ์ฌ์ฉํ๋ ์ฌ์ฉ์๋ค์๊ฒ ์ฒซ ๋ฒ์งธ ์ ์ ์ค ํ๋์ด๋ฉฐ, ๋ฐ๋ผ์ ๋ฌธ์์ ๊ธฐ์ฌํ๋ ๊ฒ์ ๋งค์ฐ ๊ฐ์น ์๋ ๊ธฐ์ฌ์
๋๋ค.
@@ -180,7 +179,7 @@ GitHub์์ ์ด์์ ๋ํ ๋ต๋ณ์ ํ๊ธฐ ์ํด์๋ Diffusers์ ๋ํ
๋ฌธ์์ ๋ํ ๋ณ๊ฒฝ ์ฌํญ์ ๋ก์ปฌ์์ ํ์ธํ๋ ๋ฐฉ๋ฒ์ [์ด ํ์ด์ง](https://github.com/huggingface/diffusers/tree/main/docs)๋ฅผ ์ฐธ์กฐํด์ฃผ์ธ์.
-### 6. ์ปค๋ฎค๋ํฐ ํ์ดํ๋ผ์ธ์ ๊ธฐ์ฌํ๊ธฐ
+### 6. ์ปค๋ฎค๋ํฐ ํ์ดํ๋ผ์ธ์ ๊ธฐ์ฌํ๊ธฐ [[6-contribute-a-community-pipeline]]
> [!TIP]
> ์ปค๋ฎค๋ํฐ ํ์ดํ๋ผ์ธ์ ๋ํด ์์ธํ ์์๋ณด๋ ค๋ฉด [์ปค๋ฎค๋ํฐ ํ์ดํ๋ผ์ธ](../using-diffusers/custom_pipeline_overview#community-pipelines) ๊ฐ์ด๋๋ฅผ ์ฝ์ด๋ณด์ธ์. ์ปค๋ฎค๋ํฐ ํ์ดํ๋ผ์ธ์ด ์ ํ์ํ์ง ๊ถ๊ธํ๋ค๋ฉด GitHub ์ด์ [#841](https://github.com/huggingface/diffusers/issues/841)๋ฅผ ํ์ธํด๋ณด์ธ์ (๊ธฐ๋ณธ์ ์ผ๋ก, ์ฐ๋ฆฌ๋ diffusion ๋ชจ๋ธ์ด ์ถ๋ก ์ ์ฌ์ฉ๋ ์ ์๋ ๋ชจ๋ ๋ฐฉ๋ฒ์ ์ ์งํ ์ ์์ง๋ง ์ปค๋ฎค๋ํฐ๊ฐ ์ด๋ฅผ ๊ตฌ์ถํ๋ ๊ฒ์ ๋ฐฉํดํ๊ณ ์ถ์ง ์์ต๋๋ค).
@@ -246,7 +245,7 @@ output = pipeline()
-GitHub ํ์ดํ๋ผ์ธ์ ๊ณต์ ํ๋ ค๋ฉด Diffusers [์ ์ฅ์](https://github.com/huggingface/diffusers)์์ PR์ ์ด๊ณ one_step_unet.py ํ์ผ์ [examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) ํ์ ํด๋์ ์ถ๊ฐํ์ธ์.
+GitHub ํ์ดํ๋ผ์ธ์ ๊ณต์ ํ๋ ค๋ฉด Diffusers [์ ์ฅ์](https://github.com/huggingface/diffusers)์์ Pull Request๋ฅผ ์ด๊ณ one_step_unet.py ํ์ผ์ [examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) ํ์ ํด๋์ ์ถ๊ฐํ์ธ์.
@@ -256,7 +255,7 @@ Hub ํ์ดํ๋ผ์ธ์ ๊ณต์ ํ๋ ค๋ฉด, ํ๋ธ์ ๋ชจ๋ธ ์ ์ฅ์๋ฅผ ์์ฑํ
-### 7. ํ๋ จ ์์ ์ ๊ธฐ์ฌํ๊ธฐ
+### 7. ํ๋ จ ์์ ์ ๊ธฐ์ฌํ๊ธฐ [[7-contribute-to-training-examples]]
Diffusers ์์ ๋ [examples](https://github.com/huggingface/diffusers/tree/main/examples) ํด๋์ ์๋ ํ๋ จ ์คํฌ๋ฆฝํธ์ ๋ชจ์์
๋๋ค.
@@ -268,7 +267,7 @@ Diffusers ์์ ๋ [examples](https://github.com/huggingface/diffusers/tree/mai
์ฐ๊ตฌ์ฉ ํ๋ จ ์์ ๋ [examples/research_projects](https://github.com/huggingface/diffusers/tree/main/examples/research_projects)์ ์์นํ๋ฉฐ, ๊ณต์ ํ๋ จ ์์ ๋ `research_projects` ๋ฐ `community` ํด๋๋ฅผ ์ ์ธํ [examples](https://github.com/huggingface/diffusers/tree/main/examples)์ ๋ชจ๋ ํด๋๋ฅผ ํฌํจํฉ๋๋ค.
๊ณต์ ํ๋ จ ์์ ๋ Diffusers์ ํต์ฌ ๋ฉ์ธํ
์ด๋๊ฐ ์ ์ง ๊ด๋ฆฌํ๋ฉฐ, ์ฐ๊ตฌ์ฉ ํ๋ จ ์์ ๋ ์ปค๋ฎค๋ํฐ๊ฐ ์ ์ง ๊ด๋ฆฌํฉ๋๋ค.
์ด๋ ๊ณต์ ํ์ดํ๋ผ์ธ vs ์ปค๋ฎค๋ํฐ ํ์ดํ๋ผ์ธ์ ๋ํ [6. ์ปค๋ฎค๋ํฐ ํ์ดํ๋ผ์ธ ๊ธฐ์ฌํ๊ธฐ](#6-contribute-a-community-pipeline)์์ ์ ์ํ ์ด์ ์ ๋์ผํฉ๋๋ค: ํต์ฌ ๋ฉ์ธํ
์ด๋๊ฐ diffusion ๋ชจ๋ธ์ ๋ชจ๋ ๊ฐ๋ฅํ ํ๋ จ ๋ฐฉ๋ฒ์ ์ ์ง ๊ด๋ฆฌํ๋ ๊ฒ์ ํ์ค์ ์ผ๋ก ๋ถ๊ฐ๋ฅํฉ๋๋ค.
-Diffusers ํต์ฌ ๋ฉ์ธํ
์๋์ ์ปค๋ฎค๋ํฐ๊ฐ ํน์ ํ๋ จ ํจ๋ฌ๋ค์์ ๋๋ฌด ์คํ์ ์ด๊ฑฐ๋ ์ถฉ๋ถํ ์ธ๊ธฐ ์๋ ๊ฒ์ผ๋ก ๊ฐ์ฃผํ๋ ๊ฒฝ์ฐ, ํด๋น ํ๋ จ ์ฝ๋๋ `research_projects` ํด๋์ ๋ฃ๊ณ ์์ฑ์๊ฐ ์ ์ง ๊ด๋ฆฌํด์ผ ํฉ๋๋ค.
+Diffusers ํต์ฌ ๋ฉ์ธํ
์ด๋์ ์ปค๋ฎค๋ํฐ๊ฐ ํน์ ํ๋ จ ํจ๋ฌ๋ค์์ ๋๋ฌด ์คํ์ ์ด๊ฑฐ๋ ์ถฉ๋ถํ ๋์ค์ ์ด์ง ์๋ค๊ณ ํ๋จํ๋ค๋ฉด, ํด๋น ํ๋ จ ์ฝ๋๋ `research_projects` ํด๋์ ๋ฃ๊ณ ์์ฑ์์ ์ํด ๊ด๋ฆฌ๋์ด์ผ ํฉ๋๋ค.
๊ณต์ ํ๋ จ ๋ฐ ์ฐ๊ตฌ ์์ ๋ ํ๋ ์ด์์ ํ๋ จ ์คํฌ๋ฆฝํธ, requirements.txt ํ์ผ ๋ฐ README.md ํ์ผ์ ํฌํจํ๋ ๋๋ ํ ๋ฆฌ๋ก ๊ตฌ์ฑ๋ฉ๋๋ค. ์ฌ์ฉ์๊ฐ ํ๋ จ ์์ ๋ฅผ ์ฌ์ฉํ๋ ค๋ฉด ๋ฆฌํฌ์งํ ๋ฆฌ๋ฅผ ๋ณต์ ํด์ผ ํฉ๋๋ค:
@@ -298,14 +297,14 @@ Diffusers์ ๊ธด๋ฐํ๊ฒ ํตํฉ๋์ด ์๊ธฐ ๋๋ฌธ์, ๊ธฐ์ฌ์๋ค์ด [Accele
๋ง์ฝ ๊ณต์ ํ๋ จ ์์ ์ ๊ธฐ์ฌํ๋ ๊ฒฝ์ฐ, [examples/test_examples.py](https://github.com/huggingface/diffusers/blob/main/examples/test_examples.py)์ ํ
์คํธ๋ฅผ ์ถ๊ฐํ๋ ๊ฒ๋ ํ์ธํด์ฃผ์ธ์. ๋น๊ณต์ ํ๋ จ ์์ ์๋ ์ด ์์
์ด ํ์ํ์ง ์์ต๋๋ค.
-### 8. "Good second issue" ๊ณ ์น๊ธฐ
+### 8. "Good second issue" ๊ณ ์น๊ธฐ [[8-fixing-a-good-second-issue]]
"Good second issue"๋ [Good second issue](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22Good+second+issue%22) ๋ผ๋ฒจ๋ก ํ์๋ฉ๋๋ค. Good second issue๋ [Good first issues](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)๋ณด๋ค ํด๊ฒฐํ๊ธฐ๊ฐ ๋ ๋ณต์กํฉ๋๋ค.
์ด์ ์ค๋ช
์ ์ผ๋ฐ์ ์ผ๋ก ์ด์๋ฅผ ํด๊ฒฐํ๋ ๋ฐฉ๋ฒ์ ๋ํด ๋ ๊ตฌ์ฒด์ ์ด๋ฉฐ, ๊ด์ฌ ์๋ ๊ธฐ์ฌ์๋ ๋ผ์ด๋ธ๋ฌ๋ฆฌ์ ๋ํ ๊ฝค ๊น์ ์ดํด๊ฐ ํ์ํฉ๋๋ค.
Good second issue๋ฅผ ํด๊ฒฐํ๊ณ ์ ํ๋ ๊ฒฝ์ฐ, ํด๋น ์ด์๋ฅผ ํด๊ฒฐํ๊ธฐ ์ํด PR์ ์ด๊ณ PR์ ์ด์์ ๋งํฌํ์ธ์. ์ด๋ฏธ ํด๋น ์ด์์ ๋ํ PR์ด ์ด๋ ค์์ง๋ง ๋ณํฉ๋์ง ์์ ๊ฒฝ์ฐ, ์ ๋ณํฉ๋์ง ์์๋์ง ์ดํดํ๊ธฐ ์ํด ์ดํด๋ณด๊ณ ๊ฐ์ ๋ PR์ ์ด์ด๋ณด์ธ์.
Good second issue๋ ์ผ๋ฐ์ ์ผ๋ก Good first issue ์ด์๋ณด๋ค ๋ณํฉํ๊ธฐ๊ฐ ๋ ์ด๋ ค์ฐ๋ฏ๋ก, ํต์ฌ ๋ฉ์ธํ
์ด๋์๊ฒ ๋์์ ์์ฒญํ๋ ๊ฒ์ด ์ข์ต๋๋ค. PR์ด ๊ฑฐ์ ์๋ฃ๋ ๊ฒฝ์ฐ, ํต์ฌ ๋ฉ์ธํ
์ด๋๋ PR์ ์ฐธ์ฌํ์ฌ ์ปค๋ฐํ๊ณ ๋ณํฉ์ ์งํํ ์ ์์ต๋๋ค.
-### 9. ํ์ดํ๋ผ์ธ, ๋ชจ๋ธ, ์ค์ผ์ค๋ฌ ์ถ๊ฐํ๊ธฐ
+### 9. ํ์ดํ๋ผ์ธ, ๋ชจ๋ธ, ์ค์ผ์ค๋ฌ ์ถ๊ฐํ๊ธฐ [[9-adding-pipelines-models-schedulers]]
ํ์ดํ๋ผ์ธ, ๋ชจ๋ธ, ์ค์ผ์ค๋ฌ๋ Diffusers ๋ผ์ด๋ธ๋ฌ๋ฆฌ์์ ๊ฐ์ฅ ์ค์ํ ๋ถ๋ถ์
๋๋ค.
์ด๋ค์ ์ต์ฒจ๋จ diffusion ๊ธฐ์ ์ ์ฝ๊ฒ ์ ๊ทผํ๋๋ก ํ๋ฉฐ, ๋ฐ๋ผ์ ์ปค๋ฎค๋ํฐ๊ฐ ๊ฐ๋ ฅํ ์์ฑํ AI ์ ํ๋ฆฌ์ผ์ด์
์ ๋ง๋ค ์ ์๋๋ก ํฉ๋๋ค.
@@ -323,9 +322,9 @@ PR์ ์๋ณธ ์ฝ๋๋ฒ ์ด์ค/๋
ผ๋ฌธ ๋งํฌ๋ฅผ ์ถ๊ฐํ๊ณ , ๊ฐ๋ฅํ๋ฉด PR์
PR์์ ๋งํ ๊ฒฝ์ฐ๋ ๋์์ด ํ์ํ ๊ฒฝ์ฐ, ์ฒซ ๋ฒ์งธ ๋ฆฌ๋ทฐ๋ ๋์์ ์์ฒญํ๋ ๋ฉ์์ง๋ฅผ ๋จ๊ธฐ๋ ๊ฒ์ ์ฃผ์ ํ์ง ๋ง์ธ์.
-#### Copied from mechanism
+#### Copied from mechanism [[copied-from-mechanism]]
-`# Copied from mechanism` ์ ํ์ดํ๋ผ์ธ, ๋ชจ๋ธ ๋๋ ์ค์ผ์ค๋ฌ ์ฝ๋๋ฅผ ์ถ๊ฐํ ๋ ์ดํดํด์ผ ํ ๋
ํนํ๊ณ ์ค์ํ ๊ธฐ๋ฅ์
๋๋ค. Diffusers ์ฝ๋๋ฒ ์ด์ค ์ ์ฒด์์ ์ด๋ฅผ ์์ฃผ ๋ณผ ์ ์๋๋ฐ, ์ด๋ฅผ ์ฌ์ฉํ๋ ์ด์ ๋ ์ฝ๋๋ฒ ์ด์ค๋ฅผ ์ดํดํ๊ธฐ ์ฝ๊ณ ์ ์ง ๊ด๋ฆฌํ๊ธฐ ์ฝ๊ฒ ์ ์งํ๊ธฐ ์ํจ์
๋๋ค. `# Copied from mechanism` ์ผ๋ก ํ์๋ ์ฝ๋๋ ๋ณต์ฌํ ์ฝ๋์ ์ ํํ ๋์ผํ๋๋ก ๊ฐ์ ๋ฉ๋๋ค. ์ด๋ฅผ ํตํด `make fix-copies`๋ฅผ ์คํํ ๋ ๋ง์ ํ์ผ์ ๊ฑธ์ณ ๋ณ๊ฒฝ ์ฌํญ์ ์ฝ๊ฒ ์
๋ฐ์ดํธํ๊ณ ์ ํํ ์ ์์ต๋๋ค.
+`# Copied from mechanism` ์ ํ์ดํ๋ผ์ธ, ๋ชจ๋ธ ๋๋ ์ค์ผ์ค๋ฌ ์ฝ๋๋ฅผ ์ถ๊ฐํ ๋ ์ดํดํด์ผ ํ ๋
ํนํ๊ณ ์ค์ํ ๊ธฐ๋ฅ์
๋๋ค. ์ด๊ฒ์ Diffusers ์ฝ๋๋ฒ ์ด์ค ์ ๋ฐ์์ ๋ณผ ์ ์์ผ๋ฉฐ, ์ด๋ฅผ ์ฌ์ฉํ๋ ์ด์ ๋ ์ฝ๋๋ฒ ์ด์ค๋ฅผ ์ดํดํ๊ณ ์ ์ง ๊ด๋ฆฌํ๊ธฐ ์ฝ๊ฒ ๋ง๋ค๊ธฐ ์ํด์์
๋๋ค. `# Copied from mechanism` ์ผ๋ก ํ์๋ ์ฝ๋๋ ๋ณต์ฌํ ์ฝ๋์ ์ ํํ ๋์ผํ๋๋ก ๊ฐ์ ๋ฉ๋๋ค. ์ด๋ ๊ฒ ํ๋ฉด `make fix-copies`๋ฅผ ์คํํ ๋๋ง๋ค ์ฌ๋ฌ ํ์ผ์ ๊ฑธ์ณ ๋ณ๊ฒฝ ์ฌํญ์ ์ฝ๊ฒ ์
๋ฐ์ดํธํ๊ณ ์ ํํ ์ ์์ต๋๋ค.
์๋ฅผ ๋ค์ด, ์๋ ์ฝ๋ ์์ ์์ [`~diffusers.pipelines.stable_diffusion.StableDiffusionPipelineOutput`]์ ์๋ ์ฝ๋์ด๋ฉฐ, `AltDiffusionPipelineOutput`์ `# Copied from mechanism`์ ์ฌ์ฉํ์ฌ ๋ณต์ฌํฉ๋๋ค. ์ ์ผํ ์ฐจ์ด์ ์ ํด๋์ค ์ ๋์ฌ๋ฅผ `Stable`์์ `Alt`๋ก ๋ณ๊ฒฝํ ๊ฒ์
๋๋ค.
@@ -347,7 +346,7 @@ class AltDiffusionPipelineOutput(BaseOutput):
๋ ์์ธํ ์๊ณ ์ถ๋ค๋ฉด [~Don't~ Repeat Yourself*](https://huggingface.co/blog/transformers-design-philosophy#4-machine-learning-models-are-static) ๋ธ๋ก๊ทธ ํฌ์คํธ์ ์ด ์น์
์ ์ฝ์ด๋ณด์ธ์.
-## ์ข์ ์ด์ ์์ฑ ๋ฐฉ๋ฒ
+## ์ข์ ์ด์ ์์ฑ ๋ฐฉ๋ฒ [[how-to-write-a-good-issue]]
**์ด์๋ฅผ ์ ์์ฑํ ์๋ก ๋น ๋ฅด๊ฒ ํด๊ฒฐ๋ ๊ฐ๋ฅ์ฑ์ด ๋์์ง๋๋ค.**
@@ -356,16 +355,16 @@ class AltDiffusionPipelineOutput(BaseOutput):
3. **์ฌํ ๊ฐ๋ฅ์ฑ**: ์ฌํ ๊ฐ๋ฅํ ์ฝ๋ ์กฐ๊ฐ์ด ์์ผ๋ฉด ํด๊ฒฐํ ์ ์์ต๋๋ค. ๋ฒ๊ทธ๋ฅผ ๋ฐ๊ฒฌํ ๊ฒฝ์ฐ, ์ ์ง ๊ด๋ฆฌ์๋ ๊ทธ ๋ฒ๊ทธ๋ฅผ ์ฌํํ ์ ์์ด์ผ ํฉ๋๋ค. ์ด์์ ์ฌํ ๊ฐ๋ฅํ ์ฝ๋ ์กฐ๊ฐ์ ํฌํจํด์ผ ํฉ๋๋ค. ์ฝ๋ ์กฐ๊ฐ์ Python ์ธํฐํ๋ฆฌํฐ์ ๋ณต์ฌํ์ฌ ๋ถ์ฌ๋ฃ์ ์ ์๋ ํํ์ฌ์ผ ํฉ๋๋ค. ์ฝ๋ ์กฐ๊ฐ์ด ์๋ํด์ผ ํฉ๋๋ค. ์ฆ, ๋๋ฝ๋ import๋ ์ด๋ฏธ์ง์ ๋ํ ๋งํฌ๊ฐ ์์ด์ผ ํฉ๋๋ค. ์ด์์๋ ์ค๋ฅ ๋ฉ์์ง์ ์ ํํ ๋์ผํ ์ค๋ฅ ๋ฉ์์ง๋ฅผ ์ฌํํ๊ธฐ ์ํด ์์ ํ์ง ์๊ณ ๋ณต์ฌํ์ฌ ๋ถ์ฌ๋ฃ์ ์ ์๋ ์ฝ๋ ์กฐ๊ฐ์ด ํฌํจ๋์ด์ผ ํฉ๋๋ค. ์ด์์ ์ฌ์ฉ์์ ๋ก์ปฌ ๋ชจ๋ธ ๊ฐ์ค์น๋ ๋ก์ปฌ ๋ฐ์ดํฐ๋ฅผ ์ฌ์ฉํ๋ ๊ฒฝ์ฐ, ๋
์๊ฐ ์ก์ธ์คํ ์ ์๋ ๊ฒฝ์ฐ ์ด์๋ฅผ ํด๊ฒฐํ ์ ์์ต๋๋ค. ๋ฐ์ดํฐ๋ ๋ชจ๋ธ์ ๊ณต์ ํ ์ ์๋ ๊ฒฝ์ฐ, ๋๋ฏธ ๋ชจ๋ธ์ด๋ ๋๋ฏธ ๋ฐ์ดํฐ๋ฅผ ๋ง๋ค์ด ์ฌ์ฉํด๋ณด์ธ์.
4. **๊ฐ๊ฒฐ์ฑ**: ๊ฐ๋ฅํ ํ ๊ฐ๊ฒฐํ๊ฒ ์ ์งํ์ฌ ๋
์๊ฐ ๋ฌธ์ ๋ฅผ ๋น ๋ฅด๊ฒ ์ดํดํ ์ ์๋๋ก ๋์์ฃผ์ธ์. ๋ฌธ์ ์ ๊ด๋ จ์ด ์๋ ์ฝ๋๋ ์ ๋ณด๋ ๋ชจ๋ ์ ๊ฑฐํด์ฃผ์ธ์. ๋ฒ๊ทธ๋ฅผ ๋ฐ๊ฒฌํ ๊ฒฝ์ฐ, ๋ฌธ์ ๋ฅผ ์ค๋ช
ํ๋ ๊ฐ์ฅ ๊ฐ๋จํ ์ฝ๋ ์์ ๋ฅผ ๋ง๋ค์ด๋ณด์ธ์. ๋ฒ๊ทธ๋ฅผ ๋ฐ๊ฒฌํ ํ์๋ ์์
ํ๋ฆ ์ ์ฒด๋ฅผ ๋ฌธ์ ์ ๋์ง๋ ๊ฒ์ด ์๋๋ผ, ์๋ฌ๊ฐ ๋ฐ์ํ๋ ํ๋ จ ์ฝ๋์ ์ด๋ ๋ถ๋ถ์ด ๋ฌธ์ ์ธ์ง ๋จผ์ ์ดํดํ๊ณ ๋ช ์ค๋ก ์ฌํํด๋ณด์ธ์. ์ ์ฒด ๋ฐ์ดํฐ์
๋์ ๋๋ฏธ ๋ฐ์ดํฐ๋ฅผ ์ฌ์ฉํด๋ณด์ธ์.
5. ๋งํฌ ์ถ๊ฐํ๊ธฐ. ํน์ ํ ์ด๋ฆ, ๋ฉ์๋, ๋๋ ๋ชจ๋ธ์ ์ฐธ์กฐํ๋ ๊ฒฝ์ฐ, ๋
์๊ฐ ๋ ์ ์ดํดํ ์ ์๋๋ก ๋งํฌ๋ฅผ ์ ๊ณตํด์ฃผ์ธ์. ํน์ PR์ด๋ ์ด์๋ฅผ ์ฐธ์กฐํ๋ ๊ฒฝ์ฐ, ํด๋น ์ด์์ ๋งํฌ๋ฅผ ๊ฑธ์ด์ฃผ์ธ์. ๋
์๊ฐ ๋ฌด์์ ๋งํ๋์ง ์๊ณ ์๋ค๊ณ ๊ฐ์ ํ์ง ๋ง์ธ์. ์ด์์ ๋งํฌ๋ฅผ ์ถ๊ฐํ ์๋ก ์ข์ต๋๋ค.
-6. ํฌ๋งทํ
. ํ์ด์ฌ ์ฝ๋ ๊ตฌ๋ฌธ์ผ๋ก ์ฝ๋๋ฅผ ํฌ๋งทํ
ํ๊ณ , ์ผ๋ฐ ์ฝ๋ ๊ตฌ๋ฌธ์ผ๋ก ์๋ฌ ๋ฉ์์ง๋ฅผ ํฌ๋งทํ
ํด์ฃผ์ธ์. ์์ธํ ๋ด์ฉ์ [๊ณต์ GitHub ํฌ๋งทํ
๋ฌธ์](https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax)๋ฅผ ์ฐธ์กฐํ์ธ์.
-7. ์ด์๋ฅผ ํด๊ฒฐํด์ผ ํ๋ ํฐ์ผ์ด ์๋๋ผ, ์ ์์ฑ๋ ๋ฐฑ๊ณผ์ฌ์ ํญ๋ชฉ์ผ๋ก ์๊ฐํด๋ณด์ธ์. ์ถ๊ฐ๋ ์ด์๋ ๊ณต๊ฐ์ ์ผ๋ก ์ฌ์ฉ ๊ฐ๋ฅํ ์ง์์ ๊ธฐ์ฌํ๋ ๊ฒ์
๋๋ค. ์ ์์ฑ๋ ์ด์๋ฅผ ์ถ๊ฐํจ์ผ๋ก์จ ๋ฉ์ธํ
์ด๋๊ฐ ๋ฌธ์ ๋ฅผ ํด๊ฒฐํ๋ ๋ฐ ๋์์ ์ฃผ๋ ๊ฒ๋ฟ๋ง ์๋๋ผ, ์ ์ฒด ์ปค๋ฎค๋ํฐ๊ฐ ๋ผ์ด๋ธ๋ฌ๋ฆฌ์ ํน์ ์ธก๋ฉด์ ๋ ์ ์ดํดํ ์ ์๋๋ก ๋์์ ์ฃผ๋ ๊ฒ์
๋๋ค.
+6. ํฌ๋งทํ
. ์ฝ๋๋ฅผ ํ์ด์ฌ ์ฝ๋ ๊ตฌ๋ฌธ์ผ๋ก, ์๋ฌ ๋ฉ์์ง๋ฅผ ์ผ๋ฐ ์ฝ๋ ๊ตฌ๋ฌธ์ผ๋ก ํ์ํํ์ฌ ์ด์๋ฅผ ๊น๋ํ๊ฒ ์์ฑํ์ธ์. ์์ธํ ๋ด์ฉ์ [GitHub ๊ณต์ ํฌ๋งทํ
๋ฌธ์](https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax)๋ฅผ ์ฐธ์กฐํ์ธ์.
+7. ์ฌ๋ฌ๋ถ์ ์ด์๋ฅผ ๋จ์ํ ํด๊ฒฐํด์ผ ํ ํฐ์ผ์ผ๋ก ์๊ฐํ์ง ๋ง๊ณ , ์ ์์ฑ๋ ๋ฐฑ๊ณผ์ฌ์ ํญ๋ชฉ์ผ๋ก ์๊ฐํด๋ณด์ธ์. ์ถ๊ฐ๋ ๋ชจ๋ ์ด์๋ ๊ณต๊ฐ์ ์ผ๋ก ์ด์ฉ ๊ฐ๋ฅํ ์ง์์ ๋ํ ๊ธฐ์ฌ์
๋๋ค. ์ ์์ฑ๋ ์ด์๋ฅผ ์ถ๊ฐํจ์ผ๋ก์จ ๋ฉ์ธํ
์ด๋๊ฐ ์ฌ๋ฌ๋ถ์ ์ด์๋ฅผ ๋ ์ฝ๊ฒ ํด๊ฒฐํ ์ ์๊ฒ ํ ๋ฟ๋ง ์๋๋ผ, ์ ์ฒด ์ปค๋ฎค๋ํฐ๊ฐ ๋ผ์ด๋ธ๋ฌ๋ฆฌ์ ํน์ ์ธก๋ฉด์ ๋ ์ ์ดํดํ ์ ์๋๋ก ๋์์ ์ฃผ๊ฒ ๋ฉ๋๋ค.
-## ์ข์ PR ์์ฑ ๋ฐฉ๋ฒ
+## ์ข์ PR ์์ฑ ๋ฐฉ๋ฒ [[how-to-write-a-good-pr]]
-1. ์นด๋ฉ๋ ์จ์ด ๋์ธ์. ๊ธฐ์กด์ ๋์์ธ ํจํด๊ณผ ๊ตฌ๋ฌธ์ ์ดํดํ๊ณ , ์ฝ๋ ์ถ๊ฐ๊ฐ ๊ธฐ์กด ์ฝ๋๋ฒ ์ด์ค์ ๋งค๋๋ฝ๊ฒ ํ๋ฅด๋๋ก ํด์ผ ํฉ๋๋ค. ๊ธฐ์กด ๋์์ธ ํจํด์ด๋ ์ฌ์ฉ์ ์ธํฐํ์ด์ค์ ํฌ๊ฒ ๋ค๋ฅธ PR์ ๋ณํฉ๋์ง ์์ต๋๋ค.
-2. ์ด์ ์ ๋ง์ถ์ธ์. ํ๋์ ๋ฌธ์ ๋ง ํด๊ฒฐํ๋ PR์ ์์ฑํด์ผ ํฉ๋๋ค. "์ถ๊ฐํ๋ฉด์ ๋ค๋ฅธ ๋ฌธ์ ๋ ํด๊ฒฐํ๊ธฐ"์ ๋น ์ง์ง ์๋๋ก ์ฃผ์ํ์ธ์. ์ฌ๋ฌ ๊ฐ์ ๊ด๋ จ ์๋ ๋ฌธ์ ๋ฅผ ํด๊ฒฐํ๋ PR์ ์์ฑํ๋ ๊ฒ์ ๋ฆฌ๋ทฐํ๊ธฐ๊ฐ ํจ์ฌ ์ด๋ ต์ต๋๋ค.
+1. ์นด๋ฉ๋ ์จ์ด ๋์ธ์. ๊ธฐ์กด์ ๋์์ธ ํจํด๊ณผ ๊ตฌ๋ฌธ์ ์ดํดํ๊ณ , ์ฌ๋ฌ๋ถ์ด ์ถ๊ฐํ๋ ์ฝ๋๊ฐ ๊ธฐ์กด ์ฝ๋๋ฒ ์ด์ค์ ์์ฐ์ค๋ฝ๊ฒ ์ด์ฐ๋ฌ์ง๋๋ก ํด์ผ ํฉ๋๋ค. ๊ธฐ์กด ๋์์ธ ํจํด์ด๋ ์ฌ์ฉ์ ์ธํฐํ์ด์ค์ ํฌ๊ฒ ๋ค๋ฅธ Pull Request๋ค์ ๋ณํฉ๋์ง ์์ต๋๋ค.
+2. ๋ ์ด์ ์ฒ๋ผ ์ง์คํ์ธ์. Pull Request๋ ํ๋์ ๋ฌธ์ , ์ค์ง ํ๋์ ๋ฌธ์ ๋ง ํด๊ฒฐํด์ผ ํฉ๋๋ค. "์ด์ ์ถ๊ฐํ๋ ๊น์ ๋ค๋ฅธ ๋ฌธ์ ๋ ๊ณ ์น์"๋ ํจ์ ์ ๋น ์ง์ง ์๋๋ก ์ฃผ์ํ์ธ์. ์ฌ๋ฌ ๊ฐ์ ๊ด๋ จ ์๋ ๋ฌธ์ ๋ฅผ ํด๊ฒฐํ๋ ํ ๋ฒ์ ํด๊ฒฐํ๋ Pull Request๋ค์ ๊ฒํ ํ๊ธฐ๊ฐ ํจ์ฌ ๋ ์ด๋ ต์ต๋๋ค.
3. ๋์์ด ๋๋ ๊ฒฝ์ฐ, ์ถ๊ฐํ ๋ด์ฉ์ด ์ด๋ป๊ฒ ์ฌ์ฉ๋๋์ง ์์ ์ฝ๋ ์กฐ๊ฐ์ ์ถ๊ฐํด๋ณด์ธ์.
-4. PR์ ์ ๋ชฉ์ ๊ธฐ์ฌ ๋ด์ฉ์ ์์ฝํด์ผ ํฉ๋๋ค.
-5. PR์ด ์ด์๋ฅผ ํด๊ฒฐํ๋ ๊ฒฝ์ฐ, PR ์ค๋ช
์ ์ด์ ๋ฒํธ๋ฅผ ์ธ๊ธํ์ฌ ์ฐ๊ฒฐ๋๋๋ก ํด์ฃผ์ธ์ (์ด์๋ฅผ ์ฐธ์กฐํ๋ ์ฌ๋๋ค์ด ์์
์ค์์ ์ ์ ์๋๋ก).
+4. Pull Request์ ์ ๋ชฉ์ ๊ธฐ์ฌ ๋ด์ฉ์ ์์ฝํด์ผ ํฉ๋๋ค.
+5. Pull Request๊ฐ ์ด์๋ฅผ ํด๊ฒฐํ๋ ๊ฒฝ์ฐ, Pull Request์ ์ค๋ช
์ ์ด์ ๋ฒํธ๋ฅผ ์ธ๊ธํ์ฌ ์ฐ๊ฒฐ๋๋๋ก ํด์ฃผ์ธ์ (์ด์๋ฅผ ์ฐธ์กฐํ๋ ์ฌ๋๋ค์ด ์์
์ค์์ ์ ์ ์๋๋ก).
6. ์งํ ์ค์ธ ์์
์ ๋ํ๋ด๋ ค๋ฉด ์ ๋ชฉ์ `[WIP]`๋ฅผ ์ ๋์ฌ๋ก ๋ถ์ฌ์ฃผ์ธ์. ์ด๋ ์ค๋ณต ์์
์ ํผํ๊ณ , ๋ณํฉ ์ค๋น๊ฐ ๋ PR๊ณผ ๊ตฌ๋ถํ ์ ์๋๋ก ๋์์ด ๋ฉ๋๋ค.
7. [์ข์ ์ด์๋ฅผ ์์ฑํ๋ ๋ฐฉ๋ฒ](#how-to-write-a-good-issue)์ ์ค๋ช
๋ ๋๋ก ํ
์คํธ๋ฅผ ๊ตฌ์ฑํ๊ณ ํ์์ ์ง์ ํด๋ณด์ธ์.
8. ๊ธฐ์กด ํ
์คํธ๊ฐ ํต๊ณผํ๋์ง ํ์ธํ์ธ์
@@ -374,10 +373,10 @@ class AltDiffusionPipelineOutput(BaseOutput):
`RUN_SLOW=1 python -m pytest tests/test_my_new_model.py`.
CircleCI๋ ๋๋ฆฐ ํ
์คํธ๋ฅผ ์คํํ์ง ์์ง๋ง, GitHub Actions๋ ๋งค์ผ ์คํํฉ๋๋ค!
10. ๋ชจ๋ ๊ณต๊ฐ ๋ฉ์๋๋ ๋งํฌ๋ค์ด๊ณผ ์ ์๋ํ๋ ์ ๋ณด์ฑ docstring์ ๊ฐ์ ธ์ผ ํฉ๋๋ค. ์์๋ก [`pipeline_latent_diffusion.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py)๋ฅผ ์ฐธ์กฐํ์ธ์.
-11. ๋ ํฌ์งํ ๋ฆฌ๊ฐ ๋น ๋ฅด๊ฒ ์ฑ์ฅํ๊ณ ์๊ธฐ ๋๋ฌธ์, ๋ ํฌ์งํ ๋ฆฌ์ ํฐ ๋ถ๋ด์ ์ฃผ๋ ํ์ผ์ด ์ถ๊ฐ๋์ง ์๋๋ก ์ฃผ์ํด์ผ ํฉ๋๋ค. ์ด๋ฏธ์ง, ๋น๋์ค ๋ฐ ๊ธฐํ ํ
์คํธ๊ฐ ์๋ ํ์ผ์ ํฌํจํฉ๋๋ค. ์ด๋ฌํ ํ์ผ์ ๋ฐฐ์นํ๊ธฐ ์ํด hf.co ํธ์คํ
`dataset`์ธ [`hf-internal-testing`](https://huggingface.co/hf-internal-testing) ๋๋ [huggingface/documentation-images](https://huggingface.co/datasets/huggingface/documentation-images)๋ฅผ ํ์ฉํ๋ ๊ฒ์ด ์ฐ์ ์
๋๋ค.
+11. ๋ฆฌํฌ์งํ ๋ฆฌ๊ฐ ๋น ๋ฅด๊ฒ ์ฑ์ฅํ๊ณ ์๊ธฐ ๋๋ฌธ์, ๋ฆฌํฌ์งํ ๋ฆฌ์ ํฐ ๋ถ๋ด์ ์ฃผ๋ ํ์ผ์ด ์ถ๊ฐ๋์ง ์๋๋ก ์ฃผ์ํด์ผ ํฉ๋๋ค. ์ด๋ฏธ์ง, ๋น๋์ค ๋ฐ ๊ธฐํ ํ
์คํธ๊ฐ ์๋ ํ์ผ์ ํฌํจํฉ๋๋ค. ์ด๋ฌํ ํ์ผ์ ๋ฐฐ์นํ๊ธฐ ์ํด hf.co ํธ์คํ
`dataset`์ธ [`hf-internal-testing`](https://huggingface.co/hf-internal-testing) ๋๋ [huggingface/documentation-images](https://huggingface.co/datasets/huggingface/documentation-images)๋ฅผ ํ์ฉํ๋ ๊ฒ์ด ์ฐ์ ์
๋๋ค.
์ธ๋ถ ๊ธฐ์ฌ์ธ ๊ฒฝ์ฐ, ์ด๋ฏธ์ง๋ฅผ PR์ ์ถ๊ฐํ๊ณ Hugging Face ๊ตฌ์ฑ์์๊ฒ ์ด๋ฏธ์ง๋ฅผ ์ด ๋ฐ์ดํฐ์
์ผ๋ก ์ด๋ํ๋๋ก ์์ฒญํ์ธ์.
-## PR์ ์ด๊ธฐ ์ํ ๋ฐฉ๋ฒ
+## PR์ ์ด๊ธฐ ์ํ ๋ฐฉ๋ฒ [[how-to-open-a-pr]]
์ฝ๋๋ฅผ ์์ฑํ๊ธฐ ์ ์, ์ด๋ฏธ ๋๊ตฐ๊ฐ๊ฐ ๊ฐ์ ์์
์ ํ๊ณ ์๋์ง ํ์ธํ๊ธฐ ์ํด ๊ธฐ์กด์ PR์ด๋ ์ด์๋ฅผ ๊ฒ์ํ๋ ๊ฒ์ด ์ข์ต๋๋ค. ํ์คํ์ง ์์ ๊ฒฝ์ฐ, ํผ๋๋ฐฑ์ ๋ฐ๊ธฐ ์ํด ์ด์๋ฅผ ์ด์ด๋ณด๋ ๊ฒ์ด ํญ์ ์ข์ ์์ด๋์ด์
๋๋ค.
@@ -403,7 +402,7 @@ CircleCI๋ ๋๋ฆฐ ํ
์คํธ๋ฅผ ์คํํ์ง ์์ง๋ง, GitHub Actions๋ ๋งค์ผ
`main` ๋ธ๋์น ์์์ **์ ๋** ์์
ํ์ง ๋ง์ธ์.
-1. ๊ฐ์ ํ๊ฒฝ์์ ๋ค์ ๋ช
๋ น์ ์คํํ์ฌ ๊ฐ๋ฐ ํ๊ฒฝ์ ์ค์ ํ์ธ์:
+4. ๊ฐ์ ํ๊ฒฝ์์ ๋ค์ ๋ช
๋ น์ ์คํํ์ฌ ๊ฐ๋ฐ ํ๊ฒฝ์ ์ค์ ํ์ธ์:
```bash
$ pip install -e ".[dev]"
@@ -467,7 +466,7 @@ CircleCI๋ ๋๋ฆฐ ํ
์คํธ๋ฅผ ์คํํ์ง ์์ง๋ง, GitHub Actions๋ ๋งค์ผ
7. ๋ฉ์ธํ
์ด๋๊ฐ ๋ณ๊ฒฝ ์ฌํญ์ ์์ฒญํ๋ ๊ฒ์ ๊ด์ฐฎ์ต๋๋ค. ํต์ฌ ๊ธฐ์ฌ์๋ค์๊ฒ๋ ์ผ์ด๋๋ ์ผ์
๋๋ค! ๋ฐ๋ผ์ ๋ณ๊ฒฝ ์ฌํญ์ Pull request์์ ๋ณผ ์ ์๋๋ก ๋ก์ปฌ ๋ธ๋์น์์ ์์
ํ๊ณ ๋ณ๊ฒฝ ์ฌํญ์ ํฌํฌ์ ํธ์ํ๋ฉด ์๋์ผ๋ก Pull request์ ๋ํ๋ฉ๋๋ค.
-### ํ
์คํธ
+### ํ
์คํธ [[tests]]
๋ผ์ด๋ธ๋ฌ๋ฆฌ ๋์๊ณผ ์ฌ๋ฌ ์์ ๋ฅผ ํ
์คํธํ๊ธฐ ์ํด ํฌ๊ด์ ์ธ ํ
์คํธ ๋ฌถ์์ด ํฌํจ๋์ด ์์ต๋๋ค. ๋ผ์ด๋ธ๋ฌ๋ฆฌ ํ
์คํธ๋ [tests ํด๋](https://github.com/huggingface/diffusers/tree/main/tests)์์ ์ฐพ์ ์ ์์ต๋๋ค.
@@ -494,7 +493,7 @@ $ python -m unittest discover -s tests -t . -v
$ python -m unittest discover -s examples -t examples -v
```
-### upstream(main)๊ณผ forked main ๋๊ธฐํํ๊ธฐ
+### upstream(HuggingFace) main๊ณผ forked main ๋๊ธฐํํ๊ธฐ [[syncing-forked-main-with-upstream-huggingface-main]]
upstream ์ ์ฅ์์ ๋ถํ์ํ ์ฐธ์กฐ ๋
ธํธ๋ฅผ ์ถ๊ฐํ๊ณ ๊ด๋ จ ๊ฐ๋ฐ์์๊ฒ ์๋ฆผ์ ๋ณด๋ด๋ ๊ฒ์ ํผํ๊ธฐ ์ํด,
forked ์ ์ฅ์์ main ๋ธ๋์น๋ฅผ ๋๊ธฐํํ ๋ ๋ค์ ๋จ๊ณ๋ฅผ ๋ฐ๋ฅด์ธ์:
@@ -507,6 +506,6 @@ $ git commit -m ''
$ git push --set-upstream origin your-branch-for-syncing
```
-### ์คํ์ผ ๊ฐ์ด๋
+### ์คํ์ผ ๊ฐ์ด๋ [[style-guide]]
Documentation string์ ๋ํด์๋, ๐งจ Diffusers๋ [Google ์คํ์ผ](https://google.github.io/styleguide/pyguide.html)์ ๋ฐ๋ฆ
๋๋ค.
diff --git a/examples/dreambooth/README_flux.md b/examples/dreambooth/README_flux.md
index fab382c0894c..952d86a1f2f0 100644
--- a/examples/dreambooth/README_flux.md
+++ b/examples/dreambooth/README_flux.md
@@ -3,17 +3,17 @@
[DreamBooth](https://arxiv.org/abs/2208.12242) is a method to personalize text2image models like stable diffusion given just a few (3~5) images of a subject.
The `train_dreambooth_flux.py` script shows how to implement the training procedure and adapt it for [FLUX.1 [dev]](https://blackforestlabs.ai/announcing-black-forest-labs/). We also provide a LoRA implementation in the `train_dreambooth_lora_flux.py` script.
-> [!NOTE]
+> [!NOTE]
> **Memory consumption**
->
-> Flux can be quite expensive to run on consumer hardware devices and as a result finetuning it comes with high memory requirements -
+>
+> Flux can be quite expensive to run on consumer hardware devices and as a result finetuning it comes with high memory requirements -
> a LoRA with a rank of 16 (w/ all components trained) can exceed 40GB of VRAM for training.
-> For more tips & guidance on training on a resource-constrained device please visit [`@bghira`'s guide](https://github.com/bghira/SimpleTuner/blob/main/documentation/quickstart/FLUX.md)
+> For more tips & guidance on training on a resource-constrained device please visit [`@bghira`'s guide](https://github.com/bghira/SimpleTuner/blob/main/documentation/quickstart/FLUX.md)
> [!NOTE]
> **Gated model**
->
+>
> As the model is gated, before using it with diffusers you first need to go to the [FLUX.1 [dev] Hugging Face page](https://huggingface.co/black-forest-labs/FLUX.1-dev), fill in the form and accept the gate. Once you are in, you need to log in so that your system knows you've accepted the gate. Use the command below to log in:
```bash
@@ -163,7 +163,7 @@ To do so, just specify `--train_text_encoder` while launching training. Please k
> [!NOTE]
> FLUX.1 has 2 text encoders (CLIP L/14 and T5-v1.1-XXL).
-By enabling `--train_text_encoder`, fine-tuning of the **CLIP encoder** is performed.
+By enabling `--train_text_encoder`, fine-tuning of the **CLIP encoder** is performed.
> At the moment, T5 fine-tuning is not supported and weights remain frozen when text encoder training is enabled.
To perform DreamBooth LoRA with text-encoder training, run:
diff --git a/examples/dreambooth/train_dreambooth_lora_sd3.py b/examples/dreambooth/train_dreambooth_lora_sd3.py
index 163031968f5b..2e77cb946f92 100644
--- a/examples/dreambooth/train_dreambooth_lora_sd3.py
+++ b/examples/dreambooth/train_dreambooth_lora_sd3.py
@@ -1454,7 +1454,7 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers):
)
# Clear the memory here
- if not args.train_text_encoder and train_dataset.custom_instance_prompts:
+ if not args.train_text_encoder and not train_dataset.custom_instance_prompts:
del tokenizers, text_encoders
# Explicitly delete the objects as well, otherwise only the lists are deleted and the original references remain, preventing garbage collection
del text_encoder_one, text_encoder_two, text_encoder_three
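The flipped condition above is the actual fix: with `--train_text_encoder` off, the tokenizers and text encoders may only be discarded when the dataset does *not* carry per-image (custom) instance prompts, because those prompts still have to be embedded later in training. A small self-contained restatement of that rule follows; the names are illustrative, not taken from the script.

```python
def can_release_text_encoders(train_text_encoder: bool, custom_instance_prompts: bool) -> bool:
    """Return True only when the tokenizers/text encoders are safe to delete.

    They must stay alive if they are being trained (--train_text_encoder) or if
    per-image (custom) instance prompts still need to be embedded during training.
    """
    return not train_text_encoder and not custom_instance_prompts


# Shared instance prompt, frozen encoders: embeddings were computed once up front, so free the
# memory (in the script this branch performs the `del` statements shown in the hunk above).
assert can_release_text_encoders(train_text_encoder=False, custom_instance_prompts=False)
# Per-image custom prompts: keep the encoders; the pre-fix guard freed them in exactly this case.
assert not can_release_text_encoders(train_text_encoder=False, custom_instance_prompts=True)
```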
diff --git a/examples/text_to_image/train_text_to_image_sdxl.py b/examples/text_to_image/train_text_to_image_sdxl.py
index 7f4917b5464c..2ca511c857ae 100644
--- a/examples/text_to_image/train_text_to_image_sdxl.py
+++ b/examples/text_to_image/train_text_to_image_sdxl.py
@@ -1084,7 +1084,7 @@ def unwrap_model(model):
# Add noise to the model input according to the noise magnitude at each timestep
# (this is the forward diffusion process)
- noisy_model_input = noise_scheduler.add_noise(model_input, noise, timesteps)
+ noisy_model_input = noise_scheduler.add_noise(model_input, noise, timesteps).to(dtype=weight_dtype)
# time ids
def compute_time_ids(original_size, crops_coords_top_left):
@@ -1101,7 +1101,7 @@ def compute_time_ids(original_size, crops_coords_top_left):
# Predict the noise residual
unet_added_conditions = {"time_ids": add_time_ids}
- prompt_embeds = batch["prompt_embeds"].to(accelerator.device)
+ prompt_embeds = batch["prompt_embeds"].to(accelerator.device, dtype=weight_dtype)
pooled_prompt_embeds = batch["pooled_prompt_embeds"].to(accelerator.device)
unet_added_conditions.update({"text_embeds": pooled_prompt_embeds})
model_pred = unet(
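Both casts target the same failure mode: with mixed precision enabled, the UNet weights are in `weight_dtype` (fp16/bf16) while the noised latents coming out of `add_noise` and the cached prompt embeddings can still be fp32, which leads to a dtype mismatch inside the UNet. Below is a minimal sketch of the pattern, with tensor shapes chosen only for illustration.

```python
import torch

weight_dtype = torch.float16  # dtype the UNet was cast to for mixed-precision training

# Cached latents and precomputed prompt embeddings are often stored in fp32.
model_input = torch.randn(1, 4, 128, 128, dtype=torch.float32)
noise = torch.randn_like(model_input)
prompt_embeds = torch.randn(1, 77, 2048, dtype=torch.float32)

# Stand-in for noise_scheduler.add_noise(...): the result keeps the fp32 input dtype,
# so it is cast to weight_dtype before being fed to the UNet, as in the diff above.
noisy_model_input = (model_input + noise).to(dtype=weight_dtype)
# Mirrors batch["prompt_embeds"].to(accelerator.device, dtype=weight_dtype).
prompt_embeds = prompt_embeds.to(dtype=weight_dtype)

assert noisy_model_input.dtype == prompt_embeds.dtype == weight_dtype
```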
diff --git a/examples/textual_inversion/README.md b/examples/textual_inversion/README.md
index 9e3a622943a1..3287f698b870 100644
--- a/examples/textual_inversion/README.md
+++ b/examples/textual_inversion/README.md
@@ -109,6 +109,9 @@ import torch
model_id = "path-to-your-trained-model"
pipe = StableDiffusionPipeline.from_pretrained(model_id,torch_dtype=torch.float16).to("cuda")
+repo_id_embeds = "path-to-your-learned-embeds"
+pipe.load_textual_inversion(repo_id_embeds)
+
prompt = "A backpack"
image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0]
diff --git a/scripts/convert_cogvideox_to_diffusers.py b/scripts/convert_cogvideox_to_diffusers.py
index c03013a7fff9..6448da7f1131 100644
--- a/scripts/convert_cogvideox_to_diffusers.py
+++ b/scripts/convert_cogvideox_to_diffusers.py
@@ -86,6 +86,9 @@ def replace_up_keys_inplace(key: str, state_dict: Dict[str, Any]):
"key_layernorm_list": reassign_query_key_layernorm_inplace,
"adaln_layer.adaLN_modulations": reassign_adaln_norm_inplace,
"embed_tokens": remove_keys_inplace,
+ "freqs_sin": remove_keys_inplace,
+ "freqs_cos": remove_keys_inplace,
+ "position_embedding": remove_keys_inplace,
}
VAE_KEYS_RENAME_DICT = {
@@ -123,11 +126,21 @@ def update_state_dict_inplace(state_dict: Dict[str, Any], old_key: str, new_key:
state_dict[new_key] = state_dict.pop(old_key)
-def convert_transformer(ckpt_path: str):
+def convert_transformer(
+ ckpt_path: str,
+ num_layers: int,
+ num_attention_heads: int,
+ use_rotary_positional_embeddings: bool,
+ dtype: torch.dtype,
+):
PREFIX_KEY = "model.diffusion_model."
original_state_dict = get_state_dict(torch.load(ckpt_path, map_location="cpu", mmap=True))
- transformer = CogVideoXTransformer3DModel()
+ transformer = CogVideoXTransformer3DModel(
+ num_layers=num_layers,
+ num_attention_heads=num_attention_heads,
+ use_rotary_positional_embeddings=use_rotary_positional_embeddings,
+ ).to(dtype=dtype)
for key in list(original_state_dict.keys()):
new_key = key[len(PREFIX_KEY) :]
@@ -145,9 +158,9 @@ def convert_transformer(ckpt_path: str):
return transformer
-def convert_vae(ckpt_path: str):
+def convert_vae(ckpt_path: str, scaling_factor: float, dtype: torch.dtype):
original_state_dict = get_state_dict(torch.load(ckpt_path, map_location="cpu", mmap=True))
- vae = AutoencoderKLCogVideoX()
+ vae = AutoencoderKLCogVideoX(scaling_factor=scaling_factor).to(dtype=dtype)
for key in list(original_state_dict.keys()):
new_key = key[:]
@@ -172,13 +185,26 @@ def get_args():
)
parser.add_argument("--vae_ckpt_path", type=str, default=None, help="Path to original vae checkpoint")
parser.add_argument("--output_path", type=str, required=True, help="Path where converted model should be saved")
- parser.add_argument("--fp16", action="store_true", default=True, help="Whether to save the model weights in fp16")
+ parser.add_argument("--fp16", action="store_true", default=False, help="Whether to save the model weights in fp16")
+ parser.add_argument("--bf16", action="store_true", default=False, help="Whether to save the model weights in bf16")
parser.add_argument(
"--push_to_hub", action="store_true", default=False, help="Whether to push to HF Hub after saving"
)
parser.add_argument(
"--text_encoder_cache_dir", type=str, default=None, help="Path to text encoder cache directory"
)
+ # For CogVideoX-2B, num_layers is 30. For 5B, it is 42
+ parser.add_argument("--num_layers", type=int, default=30, help="Number of transformer blocks")
+ # For CogVideoX-2B, num_attention_heads is 30. For 5B, it is 48
+ parser.add_argument("--num_attention_heads", type=int, default=30, help="Number of attention heads")
+ # For CogVideoX-2B, use_rotary_positional_embeddings is False. For 5B, it is True
+ parser.add_argument(
+ "--use_rotary_positional_embeddings", action="store_true", default=False, help="Whether to use RoPE or not"
+ )
+ # For CogVideoX-2B, scaling_factor is 1.15258426. For 5B, it is 0.7
+ parser.add_argument("--scaling_factor", type=float, default=1.15258426, help="Scaling factor in the VAE")
+ # For CogVideoX-2B, snr_shift_scale is 3.0. For 5B, it is 1.0
+ parser.add_argument("--snr_shift_scale", type=float, default=3.0, help="Scaling factor in the VAE")
return parser.parse_args()
@@ -188,18 +214,33 @@ def get_args():
transformer = None
vae = None
+ if args.fp16 and args.bf16:
+ raise ValueError("You cannot pass both --fp16 and --bf16 at the same time.")
+
+ dtype = torch.float16 if args.fp16 else torch.bfloat16 if args.bf16 else torch.float32
+
if args.transformer_ckpt_path is not None:
- transformer = convert_transformer(args.transformer_ckpt_path)
+ transformer = convert_transformer(
+ args.transformer_ckpt_path,
+ args.num_layers,
+ args.num_attention_heads,
+ args.use_rotary_positional_embeddings,
+ dtype,
+ )
if args.vae_ckpt_path is not None:
- vae = convert_vae(args.vae_ckpt_path)
+ vae = convert_vae(args.vae_ckpt_path, args.scaling_factor, dtype)
text_encoder_id = "google/t5-v1_1-xxl"
tokenizer = T5Tokenizer.from_pretrained(text_encoder_id, model_max_length=TOKENIZER_MAX_LENGTH)
text_encoder = T5EncoderModel.from_pretrained(text_encoder_id, cache_dir=args.text_encoder_cache_dir)
+ # The conversion no longer works unless the text encoder parameters are made contiguous (exact cause unclear).
+ for param in text_encoder.parameters():
+ param.data = param.data.contiguous()
+
scheduler = CogVideoXDDIMScheduler.from_config(
{
- "snr_shift_scale": 3.0,
+ "snr_shift_scale": args.snr_shift_scale,
"beta_end": 0.012,
"beta_schedule": "scaled_linear",
"beta_start": 0.00085,
@@ -208,7 +249,7 @@ def get_args():
"prediction_type": "v_prediction",
"rescale_betas_zero_snr": True,
"set_alpha_to_one": True,
- "timestep_spacing": "linspace",
+ "timestep_spacing": "trailing",
}
)
@@ -218,5 +259,10 @@ def get_args():
if args.fp16:
pipe = pipe.to(dtype=torch.float16)
+ if args.bf16:
+ pipe = pipe.to(dtype=torch.bfloat16)
+ # We don't use variant here because the model must be run in fp16 (2B) or bf16 (5B). Requiring users to
+ # pass a variant just to get the correct default precision would be confusing, since the default here is
+ # fp16/bf16 rather than fp32.
pipe.save_pretrained(args.output_path, safe_serialization=True, push_to_hub=args.push_to_hub)
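A quick sketch of how a pipeline produced by this updated conversion script might be used; the local output directory, the prompt, and the choice of bf16 (the 5B default discussed above) are illustrative assumptions, not part of this diff:

```python
import torch
from diffusers import CogVideoXPipeline

# "./cogvideox-5b-converted" is a placeholder for whatever --output_path was used above.
pipe = CogVideoXPipeline.from_pretrained("./cogvideox-5b-converted", torch_dtype=torch.bfloat16)
pipe.to("cuda")

# Generate a short clip; prompt and step count are arbitrary examples.
video = pipe("a panda playing a guitar on a snowy mountain", num_inference_steps=50).frames[0]
```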
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index 33be71967dec..650542c124d5 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -346,6 +346,7 @@
"StableDiffusionXLAdapterPipeline",
"StableDiffusionXLControlNetImg2ImgPipeline",
"StableDiffusionXLControlNetInpaintPipeline",
+ "StableDiffusionXLControlNetPAGImg2ImgPipeline",
"StableDiffusionXLControlNetPAGPipeline",
"StableDiffusionXLControlNetPipeline",
"StableDiffusionXLControlNetXSPipeline",
@@ -787,6 +788,7 @@
StableDiffusionXLAdapterPipeline,
StableDiffusionXLControlNetImg2ImgPipeline,
StableDiffusionXLControlNetInpaintPipeline,
+ StableDiffusionXLControlNetPAGImg2ImgPipeline,
StableDiffusionXLControlNetPAGPipeline,
StableDiffusionXLControlNetPipeline,
StableDiffusionXLControlNetXSPipeline,
diff --git a/src/diffusers/loaders/ip_adapter.py b/src/diffusers/loaders/ip_adapter.py
index 44c8c0a5181c..f2433081018e 100644
--- a/src/diffusers/loaders/ip_adapter.py
+++ b/src/diffusers/loaders/ip_adapter.py
@@ -222,7 +222,11 @@ def load_ip_adapter(
# create feature extractor if it has not been registered to the pipeline yet
if hasattr(self, "feature_extractor") and getattr(self, "feature_extractor", None) is None:
- clip_image_size = self.image_encoder.config.image_size
+ # FaceID IP adapters don't need the image encoder, so it may not be present; in that case, default to 224
+ default_clip_size = 224
+ clip_image_size = (
+ self.image_encoder.config.image_size if self.image_encoder is not None else default_clip_size
+ )
feature_extractor = CLIPImageProcessor(size=clip_image_size, crop_size=clip_image_size)
self.register_modules(feature_extractor=feature_extractor)
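For context, a hedged sketch of the FaceID case this fallback covers: no image encoder is registered, so the feature extractor now defaults to a 224 crop. The repository and weight names follow the public h94/IP-Adapter-FaceID repo but should be treated as assumptions here:

```python
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
)
pipe.load_ip_adapter(
    "h94/IP-Adapter-FaceID",
    subfolder=None,
    weight_name="ip-adapter-faceid_sd15.bin",
    image_encoder_folder=None,  # no image encoder -> feature extractor falls back to 224
)
```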
diff --git a/src/diffusers/loaders/lora_pipeline.py b/src/diffusers/loaders/lora_pipeline.py
index f612cc0c6e53..cefe66bc8cb6 100644
--- a/src/diffusers/loaders/lora_pipeline.py
+++ b/src/diffusers/loaders/lora_pipeline.py
@@ -280,7 +280,9 @@ def load_lora_into_text_encoder(
A standard state dict containing the lora layer parameters. The key should be prefixed with an
additional `text_encoder` to distinguish between unet lora layers.
network_alphas (`Dict[str, float]`):
- See `LoRALinearLayer` for more details.
+ The value of the network alpha used for stable learning and preventing underflow. This value has the
+ same meaning as the `--network_alpha` option in the kohya-ss trainer script. Refer to [this
+ link](https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning).
text_encoder (`CLIPTextModel`):
The text encoder model to load the LoRA layers into.
prefix (`str`):
@@ -753,7 +755,9 @@ def load_lora_into_text_encoder(
A standard state dict containing the lora layer parameters. The key should be prefixed with an
additional `text_encoder` to distinguish between unet lora layers.
network_alphas (`Dict[str, float]`):
- See `LoRALinearLayer` for more details.
+ The value of the network alpha used for stable learning and preventing underflow. This value has the
+ same meaning as the `--network_alpha` option in the kohya-ss trainer script. Refer to [this
+ link](https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning).
text_encoder (`CLIPTextModel`):
The text encoder model to load the LoRA layers into.
prefix (`str`):
@@ -1249,7 +1253,9 @@ def load_lora_into_text_encoder(
A standard state dict containing the lora layer parameters. The key should be prefixed with an
additional `text_encoder` to distinguish between unet lora layers.
network_alphas (`Dict[str, float]`):
- See `LoRALinearLayer` for more details.
+ The value of the network alpha used for stable learning and preventing underflow. This value has the
+ same meaning as the `--network_alpha` option in the kohya-ss trainer script. Refer to [this
+ link](https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning).
text_encoder (`CLIPTextModel`):
The text encoder model to load the LoRA layers into.
prefix (`str`):
@@ -1489,10 +1495,10 @@ class FluxLoraLoaderMixin(LoraBaseMixin):
@classmethod
@validate_hf_hub_args
- # Copied from diffusers.loaders.lora_pipeline.SD3LoraLoaderMixin.lora_state_dict
def lora_state_dict(
cls,
pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]],
+ return_alphas: bool = False,
**kwargs,
):
r"""
@@ -1577,7 +1583,26 @@ def lora_state_dict(
allow_pickle=allow_pickle,
)
- return state_dict
+ # For state dicts like
+ # https://huggingface.co/TheLastBen/Jon_Snow_Flux_LoRA
+ keys = list(state_dict.keys())
+ network_alphas = {}
+ for k in keys:
+ if "alpha" in k:
+ alpha_value = state_dict.get(k)
+ if (torch.is_tensor(alpha_value) and torch.is_floating_point(alpha_value)) or isinstance(
+ alpha_value, float
+ ):
+ network_alphas[k] = state_dict.pop(k)
+ else:
+ raise ValueError(
+ f"The alpha key ({k}) seems to be incorrect. If you think this error is unexpected, please open an issue."
+ )
+
+ if return_alphas:
+ return state_dict, network_alphas
+ else:
+ return state_dict
def load_lora_weights(
self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], adapter_name=None, **kwargs
@@ -1611,7 +1636,9 @@ def load_lora_weights(
pretrained_model_name_or_path_or_dict = pretrained_model_name_or_path_or_dict.copy()
# First, ensure that the checkpoint is a compatible one and can be successfully loaded.
- state_dict = self.lora_state_dict(pretrained_model_name_or_path_or_dict, **kwargs)
+ state_dict, network_alphas = self.lora_state_dict(
+ pretrained_model_name_or_path_or_dict, return_alphas=True, **kwargs
+ )
is_correct_format = all("lora" in key or "dora_scale" in key for key in state_dict.keys())
if not is_correct_format:
@@ -1619,6 +1646,7 @@ def load_lora_weights(
self.load_lora_into_transformer(
state_dict,
+ network_alphas=network_alphas,
transformer=getattr(self, self.transformer_name) if not hasattr(self, "transformer") else self.transformer,
adapter_name=adapter_name,
_pipeline=self,
@@ -1628,7 +1656,7 @@ def load_lora_weights(
if len(text_encoder_state_dict) > 0:
self.load_lora_into_text_encoder(
text_encoder_state_dict,
- network_alphas=None,
+ network_alphas=network_alphas,
text_encoder=self.text_encoder,
prefix="text_encoder",
lora_scale=self.lora_scale,
@@ -1637,8 +1665,7 @@ def load_lora_weights(
)
@classmethod
- # Copied from diffusers.loaders.lora_pipeline.SD3LoraLoaderMixin.load_lora_into_transformer
- def load_lora_into_transformer(cls, state_dict, transformer, adapter_name=None, _pipeline=None):
+ def load_lora_into_transformer(cls, state_dict, network_alphas, transformer, adapter_name=None, _pipeline=None):
"""
This will load the LoRA layers specified in `state_dict` into `transformer`.
@@ -1647,6 +1674,10 @@ def load_lora_into_transformer(cls, state_dict, transformer, adapter_name=None,
A standard state dict containing the lora layer parameters. The keys can either be indexed directly
into the unet or prefixed with an additional `unet` which can be used to distinguish between text
encoder lora layers.
+ network_alphas (`Dict[str, float]`):
+ The value of the network alpha used for stable learning and preventing underflow. This value has the
+ same meaning as the `--network_alpha` option in the kohya-ss trainer script. Refer to [this
+ link](https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning).
transformer (`SD3Transformer2DModel`):
The Transformer model to load the LoRA layers into.
adapter_name (`str`, *optional*):
@@ -1678,7 +1709,12 @@ def load_lora_into_transformer(cls, state_dict, transformer, adapter_name=None,
if "lora_B" in key:
rank[key] = val.shape[1]
- lora_config_kwargs = get_peft_kwargs(rank, network_alpha_dict=None, peft_state_dict=state_dict)
+ if network_alphas is not None and len(network_alphas) >= 1:
+ prefix = cls.transformer_name
+ alpha_keys = [k for k in network_alphas.keys() if k.startswith(prefix) and k.split(".")[0] == prefix]
+ network_alphas = {k.replace(f"{prefix}.", ""): v for k, v in network_alphas.items() if k in alpha_keys}
+
+ lora_config_kwargs = get_peft_kwargs(rank, network_alpha_dict=network_alphas, peft_state_dict=state_dict)
if "use_dora" in lora_config_kwargs:
if lora_config_kwargs["use_dora"] and is_peft_version("<", "0.9.0"):
raise ValueError(
@@ -1735,7 +1771,9 @@ def load_lora_into_text_encoder(
A standard state dict containing the lora layer parameters. The key should be prefixed with an
additional `text_encoder` to distinguish between unet lora layers.
network_alphas (`Dict[str, float]`):
- See `LoRALinearLayer` for more details.
+ The value of the network alpha used for stable learning and preventing underflow. This value has the
+ same meaning as the `--network_alpha` option in the kohya-ss trainer script. Refer to [this
+ link](https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning).
text_encoder (`CLIPTextModel`):
The text encoder model to load the LoRA layers into.
prefix (`str`):
@@ -1968,7 +2006,9 @@ def load_lora_into_transformer(cls, state_dict, network_alphas, transformer, ada
into the unet or prefixed with an additional `unet` which can be used to distinguish between text
encoder lora layers.
network_alphas (`Dict[str, float]`):
- See `LoRALinearLayer` for more details.
+ The value of the network alpha used for stable learning and preventing underflow. This value has the
+ same meaning as the `--network_alpha` option in the kohya-ss trainer script. Refer to [this
+ link](https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning).
unet (`UNet2DConditionModel`):
The UNet model to load the LoRA layers into.
adapter_name (`str`, *optional*):
@@ -2061,7 +2101,9 @@ def load_lora_into_text_encoder(
A standard state dict containing the lora layer parameters. The key should be prefixed with an
additional `text_encoder` to distinguish between unet lora layers.
network_alphas (`Dict[str, float]`):
- See `LoRALinearLayer` for more details.
+ The value of the network alpha used for stable learning and preventing underflow. This value has the
+ same meaning as the `--network_alpha` option in the kohya-ss trainer script. Refer to [this
+ link](https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning).
text_encoder (`CLIPTextModel`):
The text encoder model to load the LoRA layers into.
prefix (`str`):
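A minimal sketch exercising the new alpha handling in the Flux LoRA loader; the LoRA repository id comes from the comment in the diff, while the weight file name and base-model id are assumptions:

```python
import torch
from diffusers import FluxPipeline

# Classmethod call: returns the state dict together with the parsed network alphas.
state_dict, network_alphas = FluxPipeline.lora_state_dict(
    "TheLastBen/Jon_Snow_Flux_LoRA", weight_name="jon_snow.safetensors", return_alphas=True
)

# Loading through the pipeline now forwards the alphas to both the transformer and the text encoder.
pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16)
pipe.load_lora_weights("TheLastBen/Jon_Snow_Flux_LoRA", weight_name="jon_snow.safetensors")
```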
diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index f8ef9a8a74ab..c0cbfc713857 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -23,6 +23,7 @@
from ..utils import deprecate, is_transformers_available, logging
from .single_file_utils import (
SingleFileComponentError,
+ _is_legacy_scheduler_kwargs,
_is_model_weights_in_cached_folder,
_legacy_load_clip_tokenizer,
_legacy_load_safety_checker,
@@ -42,7 +43,6 @@
# Legacy behaviour. `from_single_file` does not load the safety checker unless explicitly provided
SINGLE_FILE_OPTIONAL_COMPONENTS = ["safety_checker"]
-
if is_transformers_available():
import transformers
from transformers import PreTrainedModel, PreTrainedTokenizer
@@ -135,7 +135,7 @@ def load_single_file_sub_model(
class_obj, checkpoint=checkpoint, config=cached_model_config_path, local_files_only=local_files_only
)
- elif is_diffusers_scheduler and is_legacy_loading:
+ elif is_diffusers_scheduler and (is_legacy_loading or _is_legacy_scheduler_kwargs(kwargs)):
loaded_sub_model = _legacy_load_scheduler(
class_obj, checkpoint=checkpoint, component_name=name, original_config=original_config, **kwargs
)
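Roughly, this means a call like the following (the checkpoint path is a placeholder) now routes the scheduler through the legacy loader even when legacy loading was not otherwise triggered:

```python
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_single_file(
    "v2-1_768-ema-pruned.safetensors",  # placeholder for a local checkpoint
    prediction_type="v_prediction",     # legacy kwarg detected by _is_legacy_scheduler_kwargs
)
```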
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 9c2a2cbf2942..f13fcf23877a 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -79,7 +79,10 @@
"animatediff_sdxl_beta": "up_blocks.2.motion_modules.0.temporal_transformer.norm.weight",
"animatediff_scribble": "controlnet_cond_embedding.conv_in.weight",
"animatediff_rgb": "controlnet_cond_embedding.weight",
- "flux": "double_blocks.0.img_attn.norm.key_norm.scale",
+ "flux": [
+ "double_blocks.0.img_attn.norm.key_norm.scale",
+ "model.diffusion_model.double_blocks.0.img_attn.norm.key_norm.scale",
+ ],
}
DIFFUSERS_DEFAULT_PIPELINE_PATHS = {
@@ -258,7 +261,7 @@
"timestep_spacing": "leading",
}
-LDM_VAE_KEY = "first_stage_model."
+LDM_VAE_KEYS = ["first_stage_model.", "vae."]
LDM_VAE_DEFAULT_SCALING_FACTOR = 0.18215
PLAYGROUND_VAE_SCALING_FACTOR = 0.5
LDM_UNET_KEY = "model.diffusion_model."
@@ -267,8 +270,8 @@
"cond_stage_model.transformer.",
"conditioner.embedders.0.transformer.",
]
-OPEN_CLIP_PREFIX = "conditioner.embedders.0.model."
LDM_OPEN_CLIP_TEXT_PROJECTION_DIM = 1024
+SCHEDULER_LEGACY_KWARGS = ["prediction_type", "scheduler_type"]
VALID_URL_PREFIXES = ["https://huggingface.co/", "huggingface.co/", "hf.co/", "https://hf.co/"]
@@ -318,6 +321,10 @@ def _is_model_weights_in_cached_folder(cached_folder, name):
return weights_exist
+def _is_legacy_scheduler_kwargs(kwargs):
+ return any(k in SCHEDULER_LEGACY_KWARGS for k in kwargs.keys())
+
+
def load_single_file_checkpoint(
pretrained_model_link_or_path,
force_download=False,
@@ -449,6 +456,8 @@ def infer_diffusers_model_type(checkpoint):
):
if CHECKPOINT_KEY_NAMES["v2"] in checkpoint and checkpoint[CHECKPOINT_KEY_NAMES["v2"]].shape[-1] == 1024:
model_type = "inpainting_v2"
+ elif CHECKPOINT_KEY_NAMES["xl_base"] in checkpoint:
+ model_type = "xl_inpaint"
else:
model_type = "inpainting"
@@ -516,8 +525,10 @@ def infer_diffusers_model_type(checkpoint):
else:
model_type = "animatediff_v3"
- elif CHECKPOINT_KEY_NAMES["flux"] in checkpoint:
- if "guidance_in.in_layer.bias" in checkpoint:
+ elif any(key in checkpoint for key in CHECKPOINT_KEY_NAMES["flux"]):
+ if any(
+ g in checkpoint for g in ["guidance_in.in_layer.bias", "model.diffusion_model.guidance_in.in_layer.bias"]
+ ):
model_type = "flux-dev"
else:
model_type = "flux-schnell"
@@ -1176,7 +1187,11 @@ def convert_ldm_vae_checkpoint(checkpoint, config):
# remove the LDM_VAE_KEY prefix from the ldm checkpoint keys so that it is easier to map them to diffusers keys
vae_state_dict = {}
keys = list(checkpoint.keys())
- vae_key = LDM_VAE_KEY if any(k.startswith(LDM_VAE_KEY) for k in keys) else ""
+ vae_key = ""
+ for ldm_vae_key in LDM_VAE_KEYS:
+ if any(k.startswith(ldm_vae_key) for k in keys):
+ vae_key = ldm_vae_key
+
for key in keys:
if key.startswith(vae_key):
vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key)
@@ -1477,14 +1492,22 @@ def _legacy_load_scheduler(
if scheduler_type is not None:
deprecation_message = (
- "Please pass an instance of a Scheduler object directly to the `scheduler` argument in `from_single_file`."
+ "Please pass an instance of a Scheduler object directly to the `scheduler` argument in `from_single_file`\n\n"
+ "Example:\n\n"
+ "from diffusers import StableDiffusionPipeline, DDIMScheduler\n\n"
+ "scheduler = DDIMScheduler()\n"
+ "pipe = StableDiffusionPipeline.from_single_file(<checkpoint path>, scheduler=scheduler)\n"
)
deprecate("scheduler_type", "1.0.0", deprecation_message)
if prediction_type is not None:
deprecation_message = (
- "Please configure an instance of a Scheduler with the appropriate `prediction_type` "
- "and pass the object directly to the `scheduler` argument in `from_single_file`."
+ "Please configure an instance of a Scheduler with the appropriate `prediction_type` and "
+ "pass the object directly to the `scheduler` argument in `from_single_file`.\n\n"
+ "Example:\n\n"
+ "from diffusers import StableDiffusionPipeline, DDIMScheduler\n\n"
+ 'scheduler = DDIMScheduler(prediction_type="v_prediction")\n'
+ "pipe = StableDiffusionPipeline.from_single_file(<checkpoint path>, scheduler=scheduler)\n"
)
deprecate("prediction_type", "1.0.0", deprecation_message)
@@ -1881,6 +1904,10 @@ def convert_animatediff_checkpoint_to_diffusers(checkpoint, **kwargs):
def convert_flux_transformer_checkpoint_to_diffusers(checkpoint, **kwargs):
converted_state_dict = {}
+ keys = list(checkpoint.keys())
+ for k in keys:
+ if "model.diffusion_model." in k:
+ checkpoint[k.replace("model.diffusion_model.", "")] = checkpoint.pop(k)
num_layers = list(set(int(k.split(".", 2)[1]) for k in checkpoint if "double_blocks." in k))[-1] + 1 # noqa: C401
num_single_layers = list(set(int(k.split(".", 2)[1]) for k in checkpoint if "single_blocks." in k))[-1] + 1 # noqa: C401
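A hedged sketch of what the broadened Flux key detection enables: checkpoints whose tensors keep the original "model.diffusion_model." prefix should now be recognized and stripped during conversion. The file name is a placeholder for a locally saved checkpoint:

```python
import torch
from diffusers import FluxTransformer2DModel

transformer = FluxTransformer2DModel.from_single_file(
    "flux1-dev-original.safetensors",  # placeholder: checkpoint with prefixed keys
    torch_dtype=torch.bfloat16,
)
```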
diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py
index e2ab1606b345..9f9bc5a46e10 100644
--- a/src/diffusers/models/attention_processor.py
+++ b/src/diffusers/models/attention_processor.py
@@ -1695,52 +1695,32 @@ def __call__(
return hidden_states
-# YiYi to-do: refactor rope related functions/classes
-def apply_rope(xq, xk, freqs_cis):
- xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
- xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
- xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
- xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
- return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)
-
-
-class FluxSingleAttnProcessor2_0:
- r"""
- Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
- """
+class FluxAttnProcessor2_0:
+ """Attention processor used typically in processing the SD3-like self-attention projections."""
def __init__(self):
if not hasattr(F, "scaled_dot_product_attention"):
- raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+ raise ImportError("FluxAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
def __call__(
self,
attn: Attention,
- hidden_states: torch.Tensor,
- encoder_hidden_states: Optional[torch.Tensor] = None,
+ hidden_states: torch.FloatTensor,
+ encoder_hidden_states: torch.FloatTensor = None,
attention_mask: Optional[torch.FloatTensor] = None,
image_rotary_emb: Optional[torch.Tensor] = None,
- ) -> torch.Tensor:
- input_ndim = hidden_states.ndim
-
- if input_ndim == 4:
- batch_size, channel, height, width = hidden_states.shape
- hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
-
+ ) -> torch.FloatTensor:
batch_size, _, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+ # `sample` projections.
query = attn.to_q(hidden_states)
- if encoder_hidden_states is None:
- encoder_hidden_states = hidden_states
-
- key = attn.to_k(encoder_hidden_states)
- value = attn.to_v(encoder_hidden_states)
+ key = attn.to_k(hidden_states)
+ value = attn.to_v(hidden_states)
inner_dim = key.shape[-1]
head_dim = inner_dim // attn.heads
query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-
key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
@@ -1749,33 +1729,68 @@ def __call__(
if attn.norm_k is not None:
key = attn.norm_k(key)
- # Apply RoPE if needed
+ # the attention in FluxSingleTransformerBlock does not use `encoder_hidden_states`
+ if encoder_hidden_states is not None:
+ # `context` projections.
+ encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
+ encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
+ encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
+
+ encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
+ batch_size, -1, attn.heads, head_dim
+ ).transpose(1, 2)
+ encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(
+ batch_size, -1, attn.heads, head_dim
+ ).transpose(1, 2)
+ encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(
+ batch_size, -1, attn.heads, head_dim
+ ).transpose(1, 2)
+
+ if attn.norm_added_q is not None:
+ encoder_hidden_states_query_proj = attn.norm_added_q(encoder_hidden_states_query_proj)
+ if attn.norm_added_k is not None:
+ encoder_hidden_states_key_proj = attn.norm_added_k(encoder_hidden_states_key_proj)
+
+ # attention
+ query = torch.cat([encoder_hidden_states_query_proj, query], dim=2)
+ key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
+ value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)
+
if image_rotary_emb is not None:
- # YiYi to-do: update uising apply_rotary_emb
- # from ..embeddings import apply_rotary_emb
- # query = apply_rotary_emb(query, image_rotary_emb)
- # key = apply_rotary_emb(key, image_rotary_emb)
- query, key = apply_rope(query, key, image_rotary_emb)
+ from .embeddings import apply_rotary_emb
- # the output of sdp = (batch, num_heads, seq_len, head_dim)
- # TODO: add support for attn.scale when we move to Torch 2.1
- hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False)
+ query = apply_rotary_emb(query, image_rotary_emb)
+ key = apply_rotary_emb(key, image_rotary_emb)
+ hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False)
hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
hidden_states = hidden_states.to(query.dtype)
- if input_ndim == 4:
- hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+ if encoder_hidden_states is not None:
+ encoder_hidden_states, hidden_states = (
+ hidden_states[:, : encoder_hidden_states.shape[1]],
+ hidden_states[:, encoder_hidden_states.shape[1] :],
+ )
- return hidden_states
+ # linear proj
+ hidden_states = attn.to_out[0](hidden_states)
+ # dropout
+ hidden_states = attn.to_out[1](hidden_states)
+ encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
+
+ return hidden_states, encoder_hidden_states
+ else:
+ return hidden_states
-class FluxAttnProcessor2_0:
+class FusedFluxAttnProcessor2_0:
"""Attention processor used typically in processing the SD3-like self-attention projections."""
def __init__(self):
if not hasattr(F, "scaled_dot_product_attention"):
- raise ImportError("FluxAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+ raise ImportError(
+ "FusedFluxAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
+ )
def __call__(
self,
@@ -1785,21 +1800,12 @@ def __call__(
attention_mask: Optional[torch.FloatTensor] = None,
image_rotary_emb: Optional[torch.Tensor] = None,
) -> torch.FloatTensor:
- input_ndim = hidden_states.ndim
- if input_ndim == 4:
- batch_size, channel, height, width = hidden_states.shape
- hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
- context_input_ndim = encoder_hidden_states.ndim
- if context_input_ndim == 4:
- batch_size, channel, height, width = encoder_hidden_states.shape
- encoder_hidden_states = encoder_hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
-
- batch_size = encoder_hidden_states.shape[0]
+ batch_size, _, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
# `sample` projections.
- query = attn.to_q(hidden_states)
- key = attn.to_k(hidden_states)
- value = attn.to_v(hidden_states)
+ qkv = attn.to_qkv(hidden_states)
+ split_size = qkv.shape[-1] // 3
+ query, key, value = torch.split(qkv, split_size, dim=-1)
inner_dim = key.shape[-1]
head_dim = inner_dim // attn.heads
@@ -1813,58 +1819,203 @@ def __call__(
if attn.norm_k is not None:
key = attn.norm_k(key)
+ # the attention in FluxSingleTransformerBlock does not use `encoder_hidden_states`
# `context` projections.
- encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
- encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
- encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
+ if encoder_hidden_states is not None:
+ encoder_qkv = attn.to_added_qkv(encoder_hidden_states)
+ split_size = encoder_qkv.shape[-1] // 3
+ (
+ encoder_hidden_states_query_proj,
+ encoder_hidden_states_key_proj,
+ encoder_hidden_states_value_proj,
+ ) = torch.split(encoder_qkv, split_size, dim=-1)
- encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
- batch_size, -1, attn.heads, head_dim
- ).transpose(1, 2)
- encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(
- batch_size, -1, attn.heads, head_dim
- ).transpose(1, 2)
- encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(
- batch_size, -1, attn.heads, head_dim
- ).transpose(1, 2)
-
- if attn.norm_added_q is not None:
- encoder_hidden_states_query_proj = attn.norm_added_q(encoder_hidden_states_query_proj)
- if attn.norm_added_k is not None:
- encoder_hidden_states_key_proj = attn.norm_added_k(encoder_hidden_states_key_proj)
+ encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
+ batch_size, -1, attn.heads, head_dim
+ ).transpose(1, 2)
+ encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(
+ batch_size, -1, attn.heads, head_dim
+ ).transpose(1, 2)
+ encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(
+ batch_size, -1, attn.heads, head_dim
+ ).transpose(1, 2)
- # attention
- query = torch.cat([encoder_hidden_states_query_proj, query], dim=2)
- key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
- value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)
+ if attn.norm_added_q is not None:
+ encoder_hidden_states_query_proj = attn.norm_added_q(encoder_hidden_states_query_proj)
+ if attn.norm_added_k is not None:
+ encoder_hidden_states_key_proj = attn.norm_added_k(encoder_hidden_states_key_proj)
+
+ # attention
+ query = torch.cat([encoder_hidden_states_query_proj, query], dim=2)
+ key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
+ value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)
if image_rotary_emb is not None:
- # YiYi to-do: update uising apply_rotary_emb
- # from ..embeddings import apply_rotary_emb
- # query = apply_rotary_emb(query, image_rotary_emb)
- # key = apply_rotary_emb(key, image_rotary_emb)
- query, key = apply_rope(query, key, image_rotary_emb)
+ from .embeddings import apply_rotary_emb
+
+ query = apply_rotary_emb(query, image_rotary_emb)
+ key = apply_rotary_emb(key, image_rotary_emb)
hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False)
hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
hidden_states = hidden_states.to(query.dtype)
- encoder_hidden_states, hidden_states = (
- hidden_states[:, : encoder_hidden_states.shape[1]],
- hidden_states[:, encoder_hidden_states.shape[1] :],
+ if encoder_hidden_states is not None:
+ encoder_hidden_states, hidden_states = (
+ hidden_states[:, : encoder_hidden_states.shape[1]],
+ hidden_states[:, encoder_hidden_states.shape[1] :],
+ )
+
+ # linear proj
+ hidden_states = attn.to_out[0](hidden_states)
+ # dropout
+ hidden_states = attn.to_out[1](hidden_states)
+ encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
+
+ return hidden_states, encoder_hidden_states
+ else:
+ return hidden_states
+
+
+class CogVideoXAttnProcessor2_0:
+ r"""
+ Processor for implementing scaled dot-product attention for the CogVideoX model. It applies a rotary embedding on
+ query and key vectors, but does not include spatial normalization.
+ """
+
+ def __init__(self):
+ if not hasattr(F, "scaled_dot_product_attention"):
+ raise ImportError("CogVideoXAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+
+ def __call__(
+ self,
+ attn: Attention,
+ hidden_states: torch.Tensor,
+ encoder_hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ image_rotary_emb: Optional[torch.Tensor] = None,
+ ) -> torch.Tensor:
+ text_seq_length = encoder_hidden_states.size(1)
+
+ hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
+
+ batch_size, sequence_length, _ = (
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
)
+ if attention_mask is not None:
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+ attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+ query = attn.to_q(hidden_states)
+ key = attn.to_k(hidden_states)
+ value = attn.to_v(hidden_states)
+
+ inner_dim = key.shape[-1]
+ head_dim = inner_dim // attn.heads
+
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+ if attn.norm_q is not None:
+ query = attn.norm_q(query)
+ if attn.norm_k is not None:
+ key = attn.norm_k(key)
+
+ # Apply RoPE if needed
+ if image_rotary_emb is not None:
+ from .embeddings import apply_rotary_emb
+
+ query[:, :, text_seq_length:] = apply_rotary_emb(query[:, :, text_seq_length:], image_rotary_emb)
+ if not attn.is_cross_attention:
+ key[:, :, text_seq_length:] = apply_rotary_emb(key[:, :, text_seq_length:], image_rotary_emb)
+
+ hidden_states = F.scaled_dot_product_attention(
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+ )
+
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+
# linear proj
hidden_states = attn.to_out[0](hidden_states)
# dropout
hidden_states = attn.to_out[1](hidden_states)
- encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
- if input_ndim == 4:
- hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
- if context_input_ndim == 4:
- encoder_hidden_states = encoder_hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+ encoder_hidden_states, hidden_states = hidden_states.split(
+ [text_seq_length, hidden_states.size(1) - text_seq_length], dim=1
+ )
+ return hidden_states, encoder_hidden_states
+
+
+class FusedCogVideoXAttnProcessor2_0:
+ r"""
+ Processor for implementing scaled dot-product attention for the CogVideoX model. It applies a rotary embedding on
+ query and key vectors, but does not include spatial normalization.
+ """
+
+ def __init__(self):
+ if not hasattr(F, "scaled_dot_product_attention"):
+ raise ImportError("FusedCogVideoXAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+
+ def __call__(
+ self,
+ attn: Attention,
+ hidden_states: torch.Tensor,
+ encoder_hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ image_rotary_emb: Optional[torch.Tensor] = None,
+ ) -> torch.Tensor:
+ text_seq_length = encoder_hidden_states.size(1)
+
+ hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
+
+ batch_size, sequence_length, _ = (
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+ )
+
+ if attention_mask is not None:
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+ attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+ qkv = attn.to_qkv(hidden_states)
+ split_size = qkv.shape[-1] // 3
+ query, key, value = torch.split(qkv, split_size, dim=-1)
+
+ inner_dim = key.shape[-1]
+ head_dim = inner_dim // attn.heads
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+ if attn.norm_q is not None:
+ query = attn.norm_q(query)
+ if attn.norm_k is not None:
+ key = attn.norm_k(key)
+
+ # Apply RoPE if needed
+ if image_rotary_emb is not None:
+ from .embeddings import apply_rotary_emb
+
+ query[:, :, text_seq_length:] = apply_rotary_emb(query[:, :, text_seq_length:], image_rotary_emb)
+ if not attn.is_cross_attention:
+ key[:, :, text_seq_length:] = apply_rotary_emb(key[:, :, text_seq_length:], image_rotary_emb)
+
+ hidden_states = F.scaled_dot_product_attention(
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+ )
+
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+
+ # linear proj
+ hidden_states = attn.to_out[0](hidden_states)
+ # dropout
+ hidden_states = attn.to_out[1](hidden_states)
+
+ encoder_hidden_states, hidden_states = hidden_states.split(
+ [text_seq_length, hidden_states.size(1) - text_seq_length], dim=1
+ )
return hidden_states, encoder_hidden_states
@@ -4105,6 +4256,17 @@ def __init__(self):
pass
+class FluxSingleAttnProcessor2_0(FluxAttnProcessor2_0):
+ r"""
+ Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
+ """
+
+ def __init__(self):
+ deprecation_message = "`FluxSingleAttnProcessor2_0` is deprecated and will be removed in a future version. Please use `FluxAttnProcessor2_0` instead."
+ deprecate("FluxSingleAttnProcessor2_0", "0.32.0", deprecation_message)
+ super().__init__()
+
+
ADDED_KV_ATTENTION_PROCESSORS = (
AttnAddedKVProcessor,
SlicedAttnAddedKVProcessor,
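A small sketch of the rename above: the old single-block processor now simply aliases the unified one and emits a deprecation warning when constructed (PyTorch 2.0+ assumed):

```python
from diffusers.models.attention_processor import (
    FluxAttnProcessor2_0,
    FluxSingleAttnProcessor2_0,
)

new_proc = FluxAttnProcessor2_0()
old_proc = FluxSingleAttnProcessor2_0()  # warns that it is deprecated and slated for 0.32.0 removal
assert isinstance(old_proc, FluxAttnProcessor2_0)
```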
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py b/src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py
index 3bf6e68d2628..17fa2bbf40f6 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py
@@ -902,7 +902,7 @@ class AutoencoderKLCogVideoX(ModelMixin, ConfigMixin, FromOriginalModelMixin):
Tuple of block output channels.
act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
sample_size (`int`, *optional*, defaults to `32`): Sample input size.
- scaling_factor (`float`, *optional*, defaults to 0.18215):
+ scaling_factor (`float`, *optional*, defaults to `1.15258426`):
The component-wise standard deviation of the trained latent space computed using the first batch of the
training set. This is used to scale the latent space to have unit variance when training the diffusion
model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
diff --git a/src/diffusers/models/controlnet_flux.py b/src/diffusers/models/controlnet_flux.py
index ba4933dcad67..b29930f81ea2 100644
--- a/src/diffusers/models/controlnet_flux.py
+++ b/src/diffusers/models/controlnet_flux.py
@@ -24,9 +24,9 @@
from ..models.modeling_utils import ModelMixin
from ..utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers
from .controlnet import BaseOutput, zero_module
-from .embeddings import CombinedTimestepGuidanceTextProjEmbeddings, CombinedTimestepTextProjEmbeddings
+from .embeddings import CombinedTimestepGuidanceTextProjEmbeddings, CombinedTimestepTextProjEmbeddings, FluxPosEmbed
from .modeling_outputs import Transformer2DModelOutput
-from .transformers.transformer_flux import EmbedND, FluxSingleTransformerBlock, FluxTransformerBlock
+from .transformers.transformer_flux import FluxSingleTransformerBlock, FluxTransformerBlock
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
@@ -59,7 +59,7 @@ def __init__(
self.out_channels = in_channels
self.inner_dim = num_attention_heads * attention_head_dim
- self.pos_embed = EmbedND(dim=self.inner_dim, theta=10000, axes_dim=axes_dims_rope)
+ self.pos_embed = FluxPosEmbed(theta=10000, axes_dim=axes_dims_rope)
text_time_guidance_cls = (
CombinedTimestepGuidanceTextProjEmbeddings if guidance_embeds else CombinedTimestepTextProjEmbeddings
)
@@ -272,8 +272,20 @@ def forward(
)
encoder_hidden_states = self.context_embedder(encoder_hidden_states)
- txt_ids = txt_ids.expand(img_ids.size(0), -1, -1)
- ids = torch.cat((txt_ids, img_ids), dim=1)
+ if txt_ids.ndim == 3:
+ logger.warning(
+ "Passing `txt_ids` as a 3d torch.Tensor is deprecated. "
+ "Please remove the batch dimension and pass it as a 2d torch.Tensor instead."
+ )
+ txt_ids = txt_ids[0]
+ if img_ids.ndim == 3:
+ logger.warning(
+ "Passing `img_ids` as a 3d torch.Tensor is deprecated. "
+ "Please remove the batch dimension and pass it as a 2d torch.Tensor instead."
+ )
+ img_ids = img_ids[0]
+
+ ids = torch.cat((txt_ids, img_ids), dim=0)
image_rotary_emb = self.pos_embed(ids)
block_samples = ()
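Illustration of the shape change the deprecation warnings describe; the sequence lengths are arbitrary:

```python
import torch

# Old (deprecated): ids carried a batch dimension.
txt_ids_3d = torch.zeros(1, 512, 3)
img_ids_3d = torch.zeros(1, 1024, 3)

# New: pass 2d tensors; forward() concatenates them along the sequence dimension.
txt_ids = txt_ids_3d[0]                     # (text_seq_len, 3)
img_ids = img_ids_3d[0]                     # (image_seq_len, 3)
ids = torch.cat((txt_ids, img_ids), dim=0)  # what the model now builds internally
```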
diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py
index 1258964385da..d1366654c448 100644
--- a/src/diffusers/models/embeddings.py
+++ b/src/diffusers/models/embeddings.py
@@ -374,6 +374,90 @@ def forward(self, text_embeds: torch.Tensor, image_embeds: torch.Tensor):
return embeds
+def get_3d_rotary_pos_embed(
+ embed_dim, crops_coords, grid_size, temporal_size, theta: int = 10000, use_real: bool = True
+) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+ """
+ RoPE for video tokens with 3D structure.
+
+ Args:
+ embed_dim: (`int`):
+ The embedding dimension size, corresponding to hidden_size_head.
+ crops_coords (`Tuple[int]`):
+ The top-left and bottom-right coordinates of the crop.
+ grid_size (`Tuple[int]`):
+ The grid size of the spatial positional embedding (height, width).
+ temporal_size (`int`):
+ The size of the temporal dimension.
+ theta (`float`):
+ Scaling factor for frequency computation.
+ use_real (`bool`):
+ If True, return real part and imaginary part separately. Otherwise, return complex numbers.
+
+ Returns:
+ `torch.Tensor`: positional embedding with shape `(temporal_size * grid_size[0] * grid_size[1], embed_dim/2)`.
+ """
+ start, stop = crops_coords
+ grid_h = np.linspace(start[0], stop[0], grid_size[0], endpoint=False, dtype=np.float32)
+ grid_w = np.linspace(start[1], stop[1], grid_size[1], endpoint=False, dtype=np.float32)
+ grid_t = np.linspace(0, temporal_size, temporal_size, endpoint=False, dtype=np.float32)
+
+ # Compute dimensions for each axis
+ dim_t = embed_dim // 4
+ dim_h = embed_dim // 8 * 3
+ dim_w = embed_dim // 8 * 3
+
+ # Temporal frequencies
+ freqs_t = 1.0 / (theta ** (torch.arange(0, dim_t, 2).float() / dim_t))
+ grid_t = torch.from_numpy(grid_t).float()
+ freqs_t = torch.einsum("n , f -> n f", grid_t, freqs_t)
+ freqs_t = freqs_t.repeat_interleave(2, dim=-1)
+
+ # Spatial frequencies for height and width
+ freqs_h = 1.0 / (theta ** (torch.arange(0, dim_h, 2).float() / dim_h))
+ freqs_w = 1.0 / (theta ** (torch.arange(0, dim_w, 2).float() / dim_w))
+ grid_h = torch.from_numpy(grid_h).float()
+ grid_w = torch.from_numpy(grid_w).float()
+ freqs_h = torch.einsum("n , f -> n f", grid_h, freqs_h)
+ freqs_w = torch.einsum("n , f -> n f", grid_w, freqs_w)
+ freqs_h = freqs_h.repeat_interleave(2, dim=-1)
+ freqs_w = freqs_w.repeat_interleave(2, dim=-1)
+
+ # Broadcast and concatenate tensors along specified dimension
+ def broadcast(tensors, dim=-1):
+ num_tensors = len(tensors)
+ shape_lens = {len(t.shape) for t in tensors}
+ assert len(shape_lens) == 1, "tensors must all have the same number of dimensions"
+ shape_len = list(shape_lens)[0]
+ dim = (dim + shape_len) if dim < 0 else dim
+ dims = list(zip(*(list(t.shape) for t in tensors)))
+ expandable_dims = [(i, val) for i, val in enumerate(dims) if i != dim]
+ assert all(
+ [*(len(set(t[1])) <= 2 for t in expandable_dims)]
+ ), "invalid dimensions for broadcastable concatenation"
+ max_dims = [(t[0], max(t[1])) for t in expandable_dims]
+ expanded_dims = [(t[0], (t[1],) * num_tensors) for t in max_dims]
+ expanded_dims.insert(dim, (dim, dims[dim]))
+ expandable_shapes = list(zip(*(t[1] for t in expanded_dims)))
+ tensors = [t[0].expand(*t[1]) for t in zip(tensors, expandable_shapes)]
+ return torch.cat(tensors, dim=dim)
+
+ freqs = broadcast((freqs_t[:, None, None, :], freqs_h[None, :, None, :], freqs_w[None, None, :, :]), dim=-1)
+
+ t, h, w, d = freqs.shape
+ freqs = freqs.view(t * h * w, d)
+
+ # Generate sine and cosine components
+ sin = freqs.sin()
+ cos = freqs.cos()
+
+ if use_real:
+ return cos, sin
+ else:
+ freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
+ return freqs_cis
+
+
def get_2d_rotary_pos_embed(embed_dim, crops_coords, grid_size, use_real=True):
"""
RoPE for image tokens with 2d structure.
@@ -446,6 +530,7 @@ def get_1d_rotary_pos_embed(
linear_factor=1.0,
ntk_factor=1.0,
repeat_interleave_real=True,
+ freqs_dtype=torch.float32, # torch.float32 (hunyuan, stable audio), torch.float64 (flux)
):
"""
Precompute the frequency tensor for complex exponentials (cis) with given dimensions.
@@ -468,6 +553,8 @@ def get_1d_rotary_pos_embed(
repeat_interleave_real (`bool`, *optional*, defaults to `True`):
If `True` and `use_real`, real part and imaginary part are each interleaved with themselves to reach `dim`.
Otherwise, they are concatenated with themselves.
+ freqs_dtype (`torch.float32` or `torch.float64`, *optional*, defaults to `torch.float32`):
+ the dtype of the frequency tensor.
Returns:
`torch.Tensor`: Precomputed frequency tensor with complex exponentials. [S, D/2]
"""
@@ -476,19 +563,19 @@ def get_1d_rotary_pos_embed(
if isinstance(pos, int):
pos = np.arange(pos)
theta = theta * ntk_factor
- freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)) / linear_factor # [D/2]
+ freqs = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=freqs_dtype)[: (dim // 2)] / dim)) / linear_factor # [D/2]
t = torch.from_numpy(pos).to(freqs.device) # type: ignore # [S]
- freqs = torch.outer(t, freqs).float() # type: ignore # [S, D/2]
+ freqs = torch.outer(t, freqs) # type: ignore # [S, D/2]
if use_real and repeat_interleave_real:
- freqs_cos = freqs.cos().repeat_interleave(2, dim=1) # [S, D]
- freqs_sin = freqs.sin().repeat_interleave(2, dim=1) # [S, D]
+ freqs_cos = freqs.cos().repeat_interleave(2, dim=1).float() # [S, D]
+ freqs_sin = freqs.sin().repeat_interleave(2, dim=1).float() # [S, D]
return freqs_cos, freqs_sin
elif use_real:
- freqs_cos = torch.cat([freqs.cos(), freqs.cos()], dim=-1) # [S, D]
- freqs_sin = torch.cat([freqs.sin(), freqs.sin()], dim=-1) # [S, D]
+ freqs_cos = torch.cat([freqs.cos(), freqs.cos()], dim=-1).float() # [S, D]
+ freqs_sin = torch.cat([freqs.sin(), freqs.sin()], dim=-1).float() # [S, D]
return freqs_cos, freqs_sin
else:
- freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # complex64 # [S, D/2]
+ freqs_cis = torch.polar(torch.ones_like(freqs), freqs).float() # complex64 # [S, D/2]
return freqs_cis
@@ -540,6 +627,31 @@ def apply_rotary_emb(
return x_out.type_as(x)
+class FluxPosEmbed(nn.Module):
+ # modified from https://github.com/black-forest-labs/flux/blob/c00d7c60b085fce8058b9df845e036090873f2ce/src/flux/modules/layers.py#L11
+ def __init__(self, theta: int, axes_dim: List[int]):
+ super().__init__()
+ self.theta = theta
+ self.axes_dim = axes_dim
+
+ def forward(self, ids: torch.Tensor) -> torch.Tensor:
+ n_axes = ids.shape[-1]
+ cos_out = []
+ sin_out = []
+ pos = ids.squeeze().float().cpu().numpy()
+ is_mps = ids.device.type == "mps"
+ freqs_dtype = torch.float32 if is_mps else torch.float64
+ for i in range(n_axes):
+ cos, sin = get_1d_rotary_pos_embed(
+ self.axes_dim[i], pos[:, i], repeat_interleave_real=True, use_real=True, freqs_dtype=freqs_dtype
+ )
+ cos_out.append(cos)
+ sin_out.append(sin)
+ freqs_cos = torch.cat(cos_out, dim=-1).to(ids.device)
+ freqs_sin = torch.cat(sin_out, dim=-1).to(ids.device)
+ return freqs_cos, freqs_sin
+
+
class TimestepEmbedding(nn.Module):
def __init__(
self,
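A minimal sketch of the new FluxPosEmbed module in isolation; the axes_dim values and sequence length are illustrative rather than tied to a released checkpoint:

```python
import torch
from diffusers.models.embeddings import FluxPosEmbed

pos_embed = FluxPosEmbed(theta=10000, axes_dim=[16, 56, 56])
ids = torch.zeros(512, 3)  # 2d position ids: (sequence_length, num_axes)
freqs_cos, freqs_sin = pos_embed(ids)
print(freqs_cos.shape, freqs_sin.shape)  # torch.Size([512, 128]) each (sum of axes_dim)
```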
diff --git a/src/diffusers/models/transformers/auraflow_transformer_2d.py b/src/diffusers/models/transformers/auraflow_transformer_2d.py
index f685e690cf81..ad64df0c0790 100644
--- a/src/diffusers/models/transformers/auraflow_transformer_2d.py
+++ b/src/diffusers/models/transformers/auraflow_transformer_2d.py
@@ -68,6 +68,21 @@ def __init__(
self.height, self.width = height // patch_size, width // patch_size
self.base_size = height // patch_size
+ def pe_selection_index_based_on_dim(self, h, w):
+ # select subset of positional embedding based on H, W, where H, W is size of latent
+ # PE will be viewed as 2d-grid, and H/p x W/p of the PE will be selected
+ # because original input are in flattened format, we have to flatten this 2d grid as well.
+ h_p, w_p = h // self.patch_size, w // self.patch_size
+ original_pe_indexes = torch.arange(self.pos_embed.shape[1])
+ h_max, w_max = int(self.pos_embed_max_size**0.5), int(self.pos_embed_max_size**0.5)
+ original_pe_indexes = original_pe_indexes.view(h_max, w_max)
+ starth = h_max // 2 - h_p // 2
+ endh = starth + h_p
+ startw = w_max // 2 - w_p // 2
+ endw = startw + w_p
+ original_pe_indexes = original_pe_indexes[starth:endh, startw:endw]
+ return original_pe_indexes.flatten()
+
def forward(self, latent):
batch_size, num_channels, height, width = latent.size()
latent = latent.view(
@@ -80,7 +95,8 @@ def forward(self, latent):
)
latent = latent.permute(0, 2, 4, 1, 3, 5).flatten(-3).flatten(1, 2)
latent = self.proj(latent)
- return latent + self.pos_embed
+ pe_index = self.pe_selection_index_based_on_dim(height, width)
+ return latent + self.pos_embed[:, pe_index]
# Taken from the original Aura flow inference code.
@@ -258,6 +274,7 @@ class AuraFlowTransformer2DModel(ModelMixin, ConfigMixin):
pos_embed_max_size (`int`, defaults to 4096): Maximum positions to embed from the image latents.
"""
+ _no_split_modules = ["AuraFlowJointTransformerBlock", "AuraFlowSingleTransformerBlock", "AuraFlowPatchEmbed"]
_supports_gradient_checkpointing = True
@register_to_config
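A standalone re-implementation of the center-crop index selection added to AuraFlowPatchEmbed, to show what it computes; the patch size and the pos_embed_max_size of 4096 follow the defaults mentioned in this diff, and the latent size is an arbitrary example:

```python
import torch

def pe_selection_index(h, w, patch_size=2, pos_embed_max_size=4096):
    # Treat the positional embedding as a square grid and pick the centered H/p x W/p block.
    h_p, w_p = h // patch_size, w // patch_size
    h_max = w_max = int(pos_embed_max_size**0.5)
    idx = torch.arange(pos_embed_max_size).view(h_max, w_max)
    starth, startw = h_max // 2 - h_p // 2, w_max // 2 - w_p // 2
    return idx[starth : starth + h_p, startw : startw + w_p].flatten()

print(pe_selection_index(64, 64).shape)  # torch.Size([1024]) -> the centered 32x32 block
```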
diff --git a/src/diffusers/models/transformers/cogvideox_transformer_3d.py b/src/diffusers/models/transformers/cogvideox_transformer_3d.py
index 1030b0df04ff..c8d4b1896346 100644
--- a/src/diffusers/models/transformers/cogvideox_transformer_3d.py
+++ b/src/diffusers/models/transformers/cogvideox_transformer_3d.py
@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict, Optional, Tuple, Union
import torch
from torch import nn
@@ -22,6 +22,7 @@
from ...utils import is_torch_version, logging
from ...utils.torch_utils import maybe_allow_in_graph
from ..attention import Attention, FeedForward
+from ..attention_processor import AttentionProcessor, CogVideoXAttnProcessor2_0, FusedCogVideoXAttnProcessor2_0
from ..embeddings import CogVideoXPatchEmbed, TimestepEmbedding, Timesteps, get_3d_sincos_pos_embed
from ..modeling_outputs import Transformer2DModelOutput
from ..modeling_utils import ModelMixin
@@ -97,6 +98,7 @@ def __init__(
eps=1e-6,
bias=attention_bias,
out_bias=attention_out_bias,
+ processor=CogVideoXAttnProcessor2_0(),
)
# 2. Feed Forward
@@ -116,24 +118,24 @@ def forward(
hidden_states: torch.Tensor,
encoder_hidden_states: torch.Tensor,
temb: torch.Tensor,
+ image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
) -> torch.Tensor:
+ text_seq_length = encoder_hidden_states.size(1)
+
+ # norm & modulate
norm_hidden_states, norm_encoder_hidden_states, gate_msa, enc_gate_msa = self.norm1(
hidden_states, encoder_hidden_states, temb
)
# attention
- text_length = norm_encoder_hidden_states.size(1)
-
- # CogVideoX uses concatenated text + video embeddings with self-attention instead of using
- # them in cross-attention individually
- norm_hidden_states = torch.cat([norm_encoder_hidden_states, norm_hidden_states], dim=1)
- attn_output = self.attn1(
+ attn_hidden_states, attn_encoder_hidden_states = self.attn1(
hidden_states=norm_hidden_states,
- encoder_hidden_states=None,
+ encoder_hidden_states=norm_encoder_hidden_states,
+ image_rotary_emb=image_rotary_emb,
)
- hidden_states = hidden_states + gate_msa * attn_output[:, text_length:]
- encoder_hidden_states = encoder_hidden_states + enc_gate_msa * attn_output[:, :text_length]
+ hidden_states = hidden_states + gate_msa * attn_hidden_states
+ encoder_hidden_states = encoder_hidden_states + enc_gate_msa * attn_encoder_hidden_states
# norm & modulate
norm_hidden_states, norm_encoder_hidden_states, gate_ff, enc_gate_ff = self.norm2(
@@ -144,8 +146,9 @@ def forward(
norm_hidden_states = torch.cat([norm_encoder_hidden_states, norm_hidden_states], dim=1)
ff_output = self.ff(norm_hidden_states)
- hidden_states = hidden_states + gate_ff * ff_output[:, text_length:]
- encoder_hidden_states = encoder_hidden_states + enc_gate_ff * ff_output[:, :text_length]
+ hidden_states = hidden_states + gate_ff * ff_output[:, text_seq_length:]
+ encoder_hidden_states = encoder_hidden_states + enc_gate_ff * ff_output[:, :text_seq_length]
+
return hidden_states, encoder_hidden_states
@@ -231,6 +234,7 @@ def __init__(
norm_eps: float = 1e-5,
spatial_interpolation_scale: float = 1.875,
temporal_interpolation_scale: float = 1.0,
+ use_rotary_positional_embeddings: bool = False,
):
super().__init__()
inner_dim = num_attention_heads * attention_head_dim
@@ -295,12 +299,113 @@ def __init__(
def _set_gradient_checkpointing(self, module, value=False):
self.gradient_checkpointing = value
+ @property
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
+ r"""
+ Returns:
+ `dict` of attention processors: A dictionary containing all attention processors used in the model,
+ indexed by their weight names.
+ """
+ # set recursively
+ processors = {}
+
+ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+ if hasattr(module, "get_processor"):
+ processors[f"{name}.processor"] = module.get_processor()
+
+ for sub_name, child in module.named_children():
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+ return processors
+
+ for name, module in self.named_children():
+ fn_recursive_add_processors(name, module, processors)
+
+ return processors
+
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+ r"""
+ Sets the attention processor to use to compute attention.
+
+ Parameters:
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
+ for **all** `Attention` layers.
+
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+ processor. This is strongly recommended when setting trainable attention processors.
+
+ """
+ count = len(self.attn_processors.keys())
+
+ if isinstance(processor, dict) and len(processor) != count:
+ raise ValueError(
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+ )
+
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+ if hasattr(module, "set_processor"):
+ if not isinstance(processor, dict):
+ module.set_processor(processor)
+ else:
+ module.set_processor(processor.pop(f"{name}.processor"))
+
+ for sub_name, child in module.named_children():
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+ for name, module in self.named_children():
+ fn_recursive_attn_processor(name, module, processor)
+
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections with FusedAttnProcessor2_0->FusedCogVideoXAttnProcessor2_0
+ def fuse_qkv_projections(self):
+ """
+ Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
+ are fused. For cross-attention modules, key and value projection matrices are fused.
+
+
+
+ This API is 🧪 experimental.
+
+
+ """
+ self.original_attn_processors = None
+
+ for _, attn_processor in self.attn_processors.items():
+ if "Added" in str(attn_processor.__class__.__name__):
+ raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
+
+ self.original_attn_processors = self.attn_processors
+
+ for module in self.modules():
+ if isinstance(module, Attention):
+ module.fuse_projections(fuse=True)
+
+ self.set_attn_processor(FusedCogVideoXAttnProcessor2_0())
+
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
+ def unfuse_qkv_projections(self):
+ """Disables the fused QKV projection if enabled.
+
+
+
+ This API is 🧪 experimental.
+
+
+
+ """
+ if self.original_attn_processors is not None:
+ self.set_attn_processor(self.original_attn_processors)
+
def forward(
self,
hidden_states: torch.Tensor,
encoder_hidden_states: torch.Tensor,
timestep: Union[int, float, torch.LongTensor],
timestep_cond: Optional[torch.Tensor] = None,
+ image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
return_dict: bool = True,
):
batch_size, num_frames, channels, height, width = hidden_states.shape
@@ -319,14 +424,16 @@ def forward(
hidden_states = self.patch_embed(encoder_hidden_states, hidden_states)
# 3. Position embedding
- seq_length = height * width * num_frames // (self.config.patch_size**2)
+ text_seq_length = encoder_hidden_states.shape[1]
+ if not self.config.use_rotary_positional_embeddings:
+ seq_length = height * width * num_frames // (self.config.patch_size**2)
- pos_embeds = self.pos_embedding[:, : self.config.max_text_seq_length + seq_length]
- hidden_states = hidden_states + pos_embeds
- hidden_states = self.embedding_dropout(hidden_states)
+ pos_embeds = self.pos_embedding[:, : text_seq_length + seq_length]
+ hidden_states = hidden_states + pos_embeds
+ hidden_states = self.embedding_dropout(hidden_states)
- encoder_hidden_states = hidden_states[:, : self.config.max_text_seq_length]
- hidden_states = hidden_states[:, self.config.max_text_seq_length :]
+ encoder_hidden_states = hidden_states[:, :text_seq_length]
+ hidden_states = hidden_states[:, text_seq_length:]
# 4. Transformer blocks
for i, block in enumerate(self.transformer_blocks):
@@ -344,6 +451,7 @@ def custom_forward(*inputs):
hidden_states,
encoder_hidden_states,
emb,
+ image_rotary_emb,
**ckpt_kwargs,
)
else:
@@ -351,9 +459,17 @@ def custom_forward(*inputs):
hidden_states=hidden_states,
encoder_hidden_states=encoder_hidden_states,
temb=emb,
+ image_rotary_emb=image_rotary_emb,
)
- hidden_states = self.norm_final(hidden_states)
+ if not self.config.use_rotary_positional_embeddings:
+ # CogVideoX-2B
+ hidden_states = self.norm_final(hidden_states)
+ else:
+ # CogVideoX-5B
+ hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
+ hidden_states = self.norm_final(hidden_states)
+ hidden_states = hidden_states[:, text_seq_length:]
# 5. Final block
hidden_states = self.norm_out(hidden_states, temb=emb)
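A hedged sketch of the new fused-QKV path on the CogVideoX transformer; the hub id "THUDM/CogVideoX-2b" and the "transformer" subfolder are assumptions for illustration:

```python
import torch
from diffusers import CogVideoXTransformer3DModel

transformer = CogVideoXTransformer3DModel.from_pretrained(
    "THUDM/CogVideoX-2b", subfolder="transformer", torch_dtype=torch.float16
)
transformer.fuse_qkv_projections()    # swaps in FusedCogVideoXAttnProcessor2_0
transformer.unfuse_qkv_projections()  # restores the original processors
```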
diff --git a/src/diffusers/models/transformers/pixart_transformer_2d.py b/src/diffusers/models/transformers/pixart_transformer_2d.py
index 5c9c61243c07..1e5cd5794517 100644
--- a/src/diffusers/models/transformers/pixart_transformer_2d.py
+++ b/src/diffusers/models/transformers/pixart_transformer_2d.py
@@ -19,7 +19,7 @@
from ...configuration_utils import ConfigMixin, register_to_config
from ...utils import is_torch_version, logging
from ..attention import BasicTransformerBlock
-from ..attention_processor import Attention, AttentionProcessor, FusedAttnProcessor2_0
+from ..attention_processor import Attention, AttentionProcessor, AttnProcessor, FusedAttnProcessor2_0
from ..embeddings import PatchEmbed, PixArtAlphaTextProjection
from ..modeling_outputs import Transformer2DModelOutput
from ..modeling_utils import ModelMixin
@@ -247,6 +247,14 @@ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
for name, module in self.named_children():
fn_recursive_attn_processor(name, module, processor)
+ def set_default_attn_processor(self):
+ """
+ Disables custom attention processors and sets the default attention implementation.
+
+ Safe to just use `AttnProcessor()` as PixArt doesn't have any exotic attention processors in the default model.
+ """
+ self.set_attn_processor(AttnProcessor())
+
# Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
def fuse_qkv_projections(self):
"""
diff --git a/src/diffusers/models/transformers/transformer_flux.py b/src/diffusers/models/transformers/transformer_flux.py
index 3168fd9a625f..fd0881a14880 100644
--- a/src/diffusers/models/transformers/transformer_flux.py
+++ b/src/diffusers/models/transformers/transformer_flux.py
@@ -13,7 +13,7 @@
# limitations under the License.
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, Optional, Tuple, Union
import numpy as np
import torch
@@ -23,52 +23,23 @@
from ...configuration_utils import ConfigMixin, register_to_config
from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
from ...models.attention import FeedForward
-from ...models.attention_processor import Attention, FluxAttnProcessor2_0, FluxSingleAttnProcessor2_0
+from ...models.attention_processor import (
+ Attention,
+ AttentionProcessor,
+ FluxAttnProcessor2_0,
+ FusedFluxAttnProcessor2_0,
+)
from ...models.modeling_utils import ModelMixin
from ...models.normalization import AdaLayerNormContinuous, AdaLayerNormZero, AdaLayerNormZeroSingle
from ...utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers
from ...utils.torch_utils import maybe_allow_in_graph
-from ..embeddings import CombinedTimestepGuidanceTextProjEmbeddings, CombinedTimestepTextProjEmbeddings
+from ..embeddings import CombinedTimestepGuidanceTextProjEmbeddings, CombinedTimestepTextProjEmbeddings, FluxPosEmbed
from ..modeling_outputs import Transformer2DModelOutput
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
-# YiYi to-do: refactor rope related functions/classes
-def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor:
- assert dim % 2 == 0, "The dimension must be even."
-
- scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim
- omega = 1.0 / (theta**scale)
-
- batch_size, seq_length = pos.shape
- out = torch.einsum("...n,d->...nd", pos, omega)
- cos_out = torch.cos(out)
- sin_out = torch.sin(out)
-
- stacked_out = torch.stack([cos_out, -sin_out, sin_out, cos_out], dim=-1)
- out = stacked_out.view(batch_size, -1, dim // 2, 2, 2)
- return out.float()
-
-
-# YiYi to-do: refactor rope related functions/classes
-class EmbedND(nn.Module):
- def __init__(self, dim: int, theta: int, axes_dim: List[int]):
- super().__init__()
- self.dim = dim
- self.theta = theta
- self.axes_dim = axes_dim
-
- def forward(self, ids: torch.Tensor) -> torch.Tensor:
- n_axes = ids.shape[-1]
- emb = torch.cat(
- [rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)],
- dim=-3,
- )
- return emb.unsqueeze(1)
-
-
@maybe_allow_in_graph
class FluxSingleTransformerBlock(nn.Module):
r"""
@@ -93,7 +64,7 @@ def __init__(self, dim, num_attention_heads, attention_head_dim, mlp_ratio=4.0):
self.act_mlp = nn.GELU(approximate="tanh")
self.proj_out = nn.Linear(dim + self.mlp_hidden_dim, dim)
- processor = FluxSingleAttnProcessor2_0()
+ processor = FluxAttnProcessor2_0()
self.attn = Attention(
query_dim=dim,
cross_attention_dim=None,
@@ -251,6 +222,7 @@ class FluxTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOrig
"""
_supports_gradient_checkpointing = True
+ _no_split_modules = ["FluxTransformerBlock", "FluxSingleTransformerBlock"]
@register_to_config
def __init__(
@@ -264,13 +236,14 @@ def __init__(
joint_attention_dim: int = 4096,
pooled_projection_dim: int = 768,
guidance_embeds: bool = False,
- axes_dims_rope: List[int] = [16, 56, 56],
+ axes_dims_rope: Tuple[int] = (16, 56, 56),
):
super().__init__()
self.out_channels = in_channels
self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim
- self.pos_embed = EmbedND(dim=self.inner_dim, theta=10000, axes_dim=axes_dims_rope)
+ self.pos_embed = FluxPosEmbed(theta=10000, axes_dim=axes_dims_rope)
+
text_time_guidance_cls = (
CombinedTimestepGuidanceTextProjEmbeddings if guidance_embeds else CombinedTimestepTextProjEmbeddings
)
@@ -308,6 +281,106 @@ def __init__(
self.gradient_checkpointing = False
+ @property
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
+ r"""
+ Returns:
+ `dict` of attention processors: A dictionary containing all attention processors used in the model,
+ indexed by their weight names.
+ """
+ # set recursively
+ processors = {}
+
+ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+ if hasattr(module, "get_processor"):
+ processors[f"{name}.processor"] = module.get_processor()
+
+ for sub_name, child in module.named_children():
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+ return processors
+
+ for name, module in self.named_children():
+ fn_recursive_add_processors(name, module, processors)
+
+ return processors
+
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+ r"""
+ Sets the attention processor to use to compute attention.
+
+ Parameters:
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
+ for **all** `Attention` layers.
+
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+ processor. This is strongly recommended when setting trainable attention processors.
+
+ """
+ count = len(self.attn_processors.keys())
+
+ if isinstance(processor, dict) and len(processor) != count:
+ raise ValueError(
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+ )
+
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+ if hasattr(module, "set_processor"):
+ if not isinstance(processor, dict):
+ module.set_processor(processor)
+ else:
+ module.set_processor(processor.pop(f"{name}.processor"))
+
+ for sub_name, child in module.named_children():
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+ for name, module in self.named_children():
+ fn_recursive_attn_processor(name, module, processor)
+
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections with FusedAttnProcessor2_0->FusedFluxAttnProcessor2_0
+ def fuse_qkv_projections(self):
+ """
+ Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
+ are fused. For cross-attention modules, key and value projection matrices are fused.
+
+ <Tip warning={true}>
+
+ This API is 🧪 experimental.
+
+ </Tip>
+ """
+ self.original_attn_processors = None
+
+ for _, attn_processor in self.attn_processors.items():
+ if "Added" in str(attn_processor.__class__.__name__):
+ raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
+
+ self.original_attn_processors = self.attn_processors
+
+ for module in self.modules():
+ if isinstance(module, Attention):
+ module.fuse_projections(fuse=True)
+
+ self.set_attn_processor(FusedFluxAttnProcessor2_0())
+
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
+ def unfuse_qkv_projections(self):
+ """Disables the fused QKV projection if enabled.
+
+ <Tip warning={true}>
+
+ This API is 🧪 experimental.
+
+ </Tip>
+
+ """
+ if self.original_attn_processors is not None:
+ self.set_attn_processor(self.original_attn_processors)
+
def _set_gradient_checkpointing(self, module, value=False):
if hasattr(module, "gradient_checkpointing"):
module.gradient_checkpointing = value
@@ -380,8 +453,19 @@ def forward(
)
encoder_hidden_states = self.context_embedder(encoder_hidden_states)
- txt_ids = txt_ids.expand(img_ids.size(0), -1, -1)
- ids = torch.cat((txt_ids, img_ids), dim=1)
+ if txt_ids.ndim == 3:
+ logger.warning(
+ "Passing `txt_ids` 3d torch.Tensor is deprecated."
+ "Please remove the batch dimension and pass it as a 2d torch Tensor"
+ )
+ txt_ids = txt_ids[0]
+ if img_ids.ndim == 3:
+ logger.warning(
+ "Passing `img_ids` 3d torch.Tensor is deprecated."
+ "Please remove the batch dimension and pass it as a 2d torch Tensor"
+ )
+ img_ids = img_ids[0]
+ ids = torch.cat((txt_ids, img_ids), dim=0)
image_rotary_emb = self.pos_embed(ids)
for index_block, block in enumerate(self.transformer_blocks):
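Note: the Flux transformer now expects `txt_ids`/`img_ids` without a batch dimension and concatenates them along the sequence axis. A short illustration of the new shape convention; the sequence lengths below are arbitrary assumptions.

```py
import torch

# txt_ids / img_ids are now 2D: (sequence_length, 3); a 3D input is squeezed with a warning.
txt_ids = torch.zeros(512, 3)      # e.g. 512 text tokens
img_ids = torch.zeros(64 * 64, 3)  # e.g. a 64x64 grid of latent patches
ids = torch.cat((txt_ids, img_ids), dim=0)
assert ids.shape == (512 + 64 * 64, 3)
```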
diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py
index 84e9f0b2e0eb..63436e9be6b5 100644
--- a/src/diffusers/pipelines/__init__.py
+++ b/src/diffusers/pipelines/__init__.py
@@ -154,6 +154,7 @@
"StableDiffusionControlNetPAGPipeline",
"StableDiffusionXLPAGPipeline",
"StableDiffusionXLPAGInpaintPipeline",
+ "StableDiffusionXLControlNetPAGImg2ImgPipeline",
"StableDiffusionXLControlNetPAGPipeline",
"StableDiffusionXLPAGImg2ImgPipeline",
"PixArtSigmaPAGPipeline",
@@ -547,6 +548,7 @@
StableDiffusion3PAGPipeline,
StableDiffusionControlNetPAGPipeline,
StableDiffusionPAGPipeline,
+ StableDiffusionXLControlNetPAGImg2ImgPipeline,
StableDiffusionXLControlNetPAGPipeline,
StableDiffusionXLPAGImg2ImgPipeline,
StableDiffusionXLPAGInpaintPipeline,
diff --git a/src/diffusers/pipelines/auto_pipeline.py b/src/diffusers/pipelines/auto_pipeline.py
index 5ea077db04ba..4daf0e7717e7 100644
--- a/src/diffusers/pipelines/auto_pipeline.py
+++ b/src/diffusers/pipelines/auto_pipeline.py
@@ -49,12 +49,14 @@
)
from .kandinsky3 import Kandinsky3Img2ImgPipeline, Kandinsky3Pipeline
from .latent_consistency_models import LatentConsistencyModelImg2ImgPipeline, LatentConsistencyModelPipeline
+from .lumina import LuminaText2ImgPipeline
from .pag import (
HunyuanDiTPAGPipeline,
PixArtSigmaPAGPipeline,
StableDiffusion3PAGPipeline,
StableDiffusionControlNetPAGPipeline,
StableDiffusionPAGPipeline,
+ StableDiffusionXLControlNetPAGImg2ImgPipeline,
StableDiffusionXLControlNetPAGPipeline,
StableDiffusionXLPAGImg2ImgPipeline,
StableDiffusionXLPAGInpaintPipeline,
@@ -106,6 +108,7 @@
("pixart-sigma-pag", PixArtSigmaPAGPipeline),
("auraflow", AuraFlowPipeline),
("flux", FluxPipeline),
+ ("lumina", LuminaText2ImgPipeline),
]
)
@@ -121,6 +124,7 @@
("stable-diffusion-controlnet", StableDiffusionControlNetImg2ImgPipeline),
("stable-diffusion-xl-controlnet", StableDiffusionXLControlNetImg2ImgPipeline),
("stable-diffusion-xl-pag", StableDiffusionXLPAGImg2ImgPipeline),
+ ("stable-diffusion-xl-controlnet-pag", StableDiffusionXLControlNetPAGImg2ImgPipeline),
("lcm", LatentConsistencyModelImg2ImgPipeline),
]
)
@@ -161,12 +165,12 @@
)
if is_sentencepiece_available():
- from .kolors import KolorsPipeline
+ from .kolors import KolorsImg2ImgPipeline, KolorsPipeline
from .pag import KolorsPAGPipeline
AUTO_TEXT2IMAGE_PIPELINES_MAPPING["kolors"] = KolorsPipeline
AUTO_TEXT2IMAGE_PIPELINES_MAPPING["kolors-pag"] = KolorsPAGPipeline
- AUTO_IMAGE2IMAGE_PIPELINES_MAPPING["kolors"] = KolorsPipeline
+ AUTO_IMAGE2IMAGE_PIPELINES_MAPPING["kolors"] = KolorsImg2ImgPipeline
SUPPORTED_TASKS_MAPPINGS = [
AUTO_TEXT2IMAGE_PIPELINES_MAPPING,
@@ -953,7 +957,8 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs):
if "enable_pag" in kwargs:
enable_pag = kwargs.pop("enable_pag")
if enable_pag:
- orig_class_name = config["_class_name"].replace("Pipeline", "PAGPipeline")
+ to_replace = "InpaintPipeline" if "Inpaint" in config["_class_name"] else "Pipeline"
+ orig_class_name = config["_class_name"].replace(to_replace, "PAG" + to_replace)
inpainting_cls = _get_task_class(AUTO_INPAINT_PIPELINES_MAPPING, orig_class_name)
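Note: the PAG auto-pipeline remapping now preserves the `Inpaint` suffix. A minimal sketch of the string transformation performed above (pure string handling, no models involved; the helper name is hypothetical).

```py
def pag_class_name(class_name: str) -> str:
    # Mirrors the remapping applied when `enable_pag=True` is passed to the auto pipeline.
    to_replace = "InpaintPipeline" if "Inpaint" in class_name else "Pipeline"
    return class_name.replace(to_replace, "PAG" + to_replace)

assert pag_class_name("StableDiffusionXLInpaintPipeline") == "StableDiffusionXLPAGInpaintPipeline"
assert pag_class_name("StableDiffusionXLPipeline") == "StableDiffusionXLPAGPipeline"
```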
diff --git a/src/diffusers/pipelines/cogvideo/pipeline_cogvideox.py b/src/diffusers/pipelines/cogvideo/pipeline_cogvideox.py
index 2ab02e91c2e8..fd8be5597ee7 100644
--- a/src/diffusers/pipelines/cogvideo/pipeline_cogvideox.py
+++ b/src/diffusers/pipelines/cogvideo/pipeline_cogvideox.py
@@ -23,6 +23,7 @@
from ...callbacks import MultiPipelineCallbacks, PipelineCallback
from ...models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
+from ...models.embeddings import get_3d_rotary_pos_embed
from ...pipelines.pipeline_utils import DiffusionPipeline
from ...schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
from ...utils import BaseOutput, logging, replace_example_docstring
@@ -40,6 +41,7 @@
>>> from diffusers import CogVideoXPipeline
>>> from diffusers.utils import export_to_video
+ >>> # Models: "THUDM/CogVideoX-2b" or "THUDM/CogVideoX-5b"
>>> pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.float16).to("cuda")
>>> prompt = (
... "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. "
@@ -55,6 +57,25 @@
"""
+# Similar to diffusers.pipelines.hunyuandit.pipeline_hunyuandit.get_resize_crop_region_for_grid
+def get_resize_crop_region_for_grid(src, tgt_width, tgt_height):
+ tw = tgt_width
+ th = tgt_height
+ h, w = src
+ r = h / w
+ if r > (th / tw):
+ resize_height = th
+ resize_width = int(round(th / h * w))
+ else:
+ resize_width = tw
+ resize_height = int(round(tw / w * h))
+
+ crop_top = int(round((th - resize_height) / 2.0))
+ crop_left = int(round((tw - resize_width) / 2.0))
+
+ return (crop_top, crop_left), (crop_top + resize_height, crop_left + resize_width)
+
+
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
def retrieve_timesteps(
scheduler,
@@ -409,6 +430,46 @@ def check_inputs(
f" {negative_prompt_embeds.shape}."
)
+ def fuse_qkv_projections(self) -> None:
+ r"""Enables fused QKV projections."""
+ self.fusing_transformer = True
+ self.transformer.fuse_qkv_projections()
+
+ def unfuse_qkv_projections(self) -> None:
+ r"""Disable QKV projection fusion if enabled."""
+ if not self.fusing_transformer:
+ logger.warning("The Transformer was not initially fused for QKV projections. Doing nothing.")
+ else:
+ self.transformer.unfuse_qkv_projections()
+ self.fusing_transformer = False
+
+ def _prepare_rotary_positional_embeddings(
+ self,
+ height: int,
+ width: int,
+ num_frames: int,
+ device: torch.device,
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ grid_height = height // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
+ grid_width = width // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
+ base_size_width = 720 // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
+ base_size_height = 480 // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
+
+ grid_crops_coords = get_resize_crop_region_for_grid(
+ (grid_height, grid_width), base_size_width, base_size_height
+ )
+ freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
+ embed_dim=self.transformer.config.attention_head_dim,
+ crops_coords=grid_crops_coords,
+ grid_size=(grid_height, grid_width),
+ temporal_size=num_frames,
+ use_real=True,
+ )
+
+ freqs_cos = freqs_cos.to(device=device)
+ freqs_sin = freqs_sin.to(device=device)
+ return freqs_cos, freqs_sin
+
@property
def guidance_scale(self):
return self._guidance_scale
@@ -599,7 +660,14 @@ def __call__(
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
- # 7. Denoising loop
+ # 7. Create rotary embeds if required
+ image_rotary_emb = (
+ self._prepare_rotary_positional_embeddings(height, width, latents.size(1), device)
+ if self.transformer.config.use_rotary_positional_embeddings
+ else None
+ )
+
+ # 8. Denoising loop
num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
with self.progress_bar(total=num_inference_steps) as progress_bar:
@@ -620,6 +688,7 @@ def __call__(
hidden_states=latent_model_input,
encoder_hidden_states=prompt_embeds,
timestep=timestep,
+ image_rotary_emb=image_rotary_emb,
return_dict=False,
)[0]
noise_pred = noise_pred.float()
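Note: the grid sizes passed to `get_3d_rotary_pos_embed` above reduce to simple integer arithmetic. A worked check, assuming the CogVideoX defaults `vae_scale_factor_spatial=8` and `patch_size=2`.

```py
# Assumed defaults for illustration: vae_scale_factor_spatial=8, patch_size=2.
height, width = 480, 720
vae_scale_factor_spatial, patch_size = 8, 2

grid_height = height // (vae_scale_factor_spatial * patch_size)    # 30
grid_width = width // (vae_scale_factor_spatial * patch_size)      # 45
base_size_width = 720 // (vae_scale_factor_spatial * patch_size)   # 45
base_size_height = 480 // (vae_scale_factor_spatial * patch_size)  # 30

# At the base resolution the crop region covers the full grid.
print(grid_height, grid_width, base_size_height, base_size_width)  # 30 45 30 45
```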
diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py
index 278d8d953e9b..ba0a34ad36bb 100644
--- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py
+++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py
@@ -1538,7 +1538,6 @@ def __call__(
if isinstance(controlnet_cond_scale, list):
controlnet_cond_scale = controlnet_cond_scale[0]
cond_scale = controlnet_cond_scale * controlnet_keep[i]
-
down_block_res_samples, mid_block_res_sample = self.controlnet(
control_model_input,
t,
diff --git a/src/diffusers/pipelines/flux/pipeline_flux.py b/src/diffusers/pipelines/flux/pipeline_flux.py
index 513feaabc41f..5a413455e447 100644
--- a/src/diffusers/pipelines/flux/pipeline_flux.py
+++ b/src/diffusers/pipelines/flux/pipeline_flux.py
@@ -20,7 +20,7 @@
from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast
from ...image_processor import VaeImageProcessor
-from ...loaders import FluxLoraLoaderMixin
+from ...loaders import FluxLoraLoaderMixin, FromSingleFileMixin
from ...models.autoencoders import AutoencoderKL
from ...models.transformers import FluxTransformer2DModel
from ...schedulers import FlowMatchEulerDiscreteScheduler
@@ -137,7 +137,7 @@ def retrieve_timesteps(
return timesteps, num_inference_steps
-class FluxPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
+class FluxPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
r"""
The Flux pipeline for text-to-image generation.
@@ -331,10 +331,6 @@ def encode_prompt(
scale_lora_layers(self.text_encoder_2, lora_scale)
prompt = [prompt] if isinstance(prompt, str) else prompt
- if prompt is not None:
- batch_size = len(prompt)
- else:
- batch_size = prompt_embeds.shape[0]
if prompt_embeds is None:
prompt_2 = prompt_2 or prompt
@@ -364,8 +360,7 @@ def encode_prompt(
unscale_lora_layers(self.text_encoder_2, lora_scale)
dtype = self.text_encoder.dtype if self.text_encoder is not None else self.transformer.dtype
- text_ids = torch.zeros(batch_size, prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
- text_ids = text_ids.repeat(num_images_per_prompt, 1, 1)
+ text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
return prompt_embeds, pooled_prompt_embeds, text_ids
@@ -425,9 +420,8 @@ def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
- latent_image_ids = latent_image_ids[None, :].repeat(batch_size, 1, 1, 1)
latent_image_ids = latent_image_ids.reshape(
- batch_size, latent_image_id_height * latent_image_id_width, latent_image_id_channels
+ latent_image_id_height * latent_image_id_width, latent_image_id_channels
)
return latent_image_ids.to(device=device, dtype=dtype)
@@ -454,6 +448,35 @@ def _unpack_latents(latents, height, width, vae_scale_factor):
return latents
+ def enable_vae_slicing(self):
+ r"""
+ Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
+ compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
+ """
+ self.vae.enable_slicing()
+
+ def disable_vae_slicing(self):
+ r"""
+ Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
+ computing decoding in one step.
+ """
+ self.vae.disable_slicing()
+
+ def enable_vae_tiling(self):
+ r"""
+ Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
+ compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
+ processing larger images.
+ """
+ self.vae.enable_tiling()
+
+ def disable_vae_tiling(self):
+ r"""
+ Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
+ computing decoding in one step.
+ """
+ self.vae.disable_tiling()
+
def prepare_latents(
self,
batch_size,
@@ -677,6 +700,13 @@ def __call__(
num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
self._num_timesteps = len(timesteps)
+ # handle guidance
+ if self.transformer.config.guidance_embeds:
+ guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
+ guidance = guidance.expand(latents.shape[0])
+ else:
+ guidance = None
+
# 6. Denoising loop
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
@@ -686,16 +716,8 @@ def __call__(
# broadcast to batch dimension in a way that's compatible with ONNX/Core ML
timestep = t.expand(latents.shape[0]).to(latents.dtype)
- # handle guidance
- if self.transformer.config.guidance_embeds:
- guidance = torch.tensor([guidance_scale], device=device)
- guidance = guidance.expand(latents.shape[0])
- else:
- guidance = None
-
noise_pred = self.transformer(
hidden_states=latents,
- # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transforme rmodel (we should not keep it but I want to keep the inputs same for the model for testing)
timestep=timestep / 1000,
guidance=guidance,
pooled_projections=pooled_prompt_embeds,
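Note: `FluxPipeline` picks up the VAE slicing/tiling toggles added above. A hedged usage sketch; the checkpoint id, dtype, prompt, and step count are illustrative.

```py
import torch
from diffusers import FluxPipeline

# Checkpoint id and settings are illustrative.
pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)
pipe.enable_model_cpu_offload()

pipe.enable_vae_slicing()  # decode the batch one slice at a time
pipe.enable_vae_tiling()   # decode large images tile by tile

image = pipe("a tiny robot watering a plant", num_inference_steps=4).images[0]

pipe.disable_vae_tiling()
pipe.disable_vae_slicing()
```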
diff --git a/src/diffusers/pipelines/flux/pipeline_flux_controlnet.py b/src/diffusers/pipelines/flux/pipeline_flux_controlnet.py
index b7b956fca406..8d4d387b9147 100644
--- a/src/diffusers/pipelines/flux/pipeline_flux_controlnet.py
+++ b/src/diffusers/pipelines/flux/pipeline_flux_controlnet.py
@@ -25,7 +25,7 @@
)
from ...image_processor import PipelineImageInput, VaeImageProcessor
-from ...loaders import FluxLoraLoaderMixin
+from ...loaders import FluxLoraLoaderMixin, FromSingleFileMixin
from ...models.autoencoders import AutoencoderKL
from ...models.controlnet_flux import FluxControlNetModel
from ...models.transformers import FluxTransformer2DModel
@@ -155,7 +155,7 @@ def retrieve_timesteps(
return timesteps, num_inference_steps
-class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
+class FluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
r"""
The Flux pipeline for text-to-image generation.
@@ -354,10 +354,6 @@ def encode_prompt(
scale_lora_layers(self.text_encoder_2, lora_scale)
prompt = [prompt] if isinstance(prompt, str) else prompt
- if prompt is not None:
- batch_size = len(prompt)
- else:
- batch_size = prompt_embeds.shape[0]
if prompt_embeds is None:
prompt_2 = prompt_2 or prompt
@@ -387,8 +383,7 @@ def encode_prompt(
unscale_lora_layers(self.text_encoder_2, lora_scale)
dtype = self.text_encoder.dtype if self.text_encoder is not None else self.transformer.dtype
- text_ids = torch.zeros(batch_size, prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
- text_ids = text_ids.repeat(num_images_per_prompt, 1, 1)
+ text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
return prompt_embeds, pooled_prompt_embeds, text_ids
@@ -449,9 +444,8 @@ def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
- latent_image_ids = latent_image_ids[None, :].repeat(batch_size, 1, 1, 1)
latent_image_ids = latent_image_ids.reshape(
- batch_size, latent_image_id_height * latent_image_id_width, latent_image_id_channels
+ latent_image_id_height * latent_image_id_width, latent_image_id_channels
)
return latent_image_ids.to(device=device, dtype=dtype)
@@ -804,7 +798,6 @@ def __call__(
noise_pred = self.transformer(
hidden_states=latents,
- # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transforme rmodel (we should not keep it but I want to keep the inputs same for the model for testing)
timestep=timestep / 1000,
guidance=guidance,
pooled_projections=pooled_prompt_embeds,
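Note: both Flux pipelines now inherit `FromSingleFileMixin`, so `from_single_file` becomes available on them. A sketch under that assumption; the checkpoint path is a placeholder, not a real file.

```py
import torch
from diffusers import FluxPipeline

# The checkpoint path below is a placeholder assumption.
pipe = FluxPipeline.from_single_file(
    "/path/to/flux_checkpoint.safetensors",
    torch_dtype=torch.bfloat16,
)
```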
diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py
index 9db767681b04..68334fef3811 100644
--- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py
+++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py
@@ -547,7 +547,7 @@ def __call__(
negative_image_embeds = prior_outputs[1]
prompt = [prompt] if not isinstance(prompt, (list, tuple)) else prompt
- image = [image] if isinstance(prompt, PIL.Image.Image) else image
+ image = [image] if isinstance(image, PIL.Image.Image) else image
if len(prompt) < image_embeds.shape[0] and image_embeds.shape[0] % len(prompt) == 0:
prompt = (image_embeds.shape[0] // len(prompt)) * prompt
@@ -813,7 +813,7 @@ def __call__(
negative_image_embeds = prior_outputs[1]
prompt = [prompt] if not isinstance(prompt, (list, tuple)) else prompt
- image = [image] if isinstance(prompt, PIL.Image.Image) else image
+ image = [image] if isinstance(image, PIL.Image.Image) else image
mask_image = [mask_image] if isinstance(mask_image, PIL.Image.Image) else mask_image
if len(prompt) < image_embeds.shape[0] and image_embeds.shape[0] % len(prompt) == 0:
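Note: the Kandinsky combined pipelines previously listified `image` based on `prompt`'s type; the fix above checks `image` itself. A tiny illustration of the corrected wrapping.

```py
import PIL.Image

image = PIL.Image.new("RGB", (64, 64))
# Fixed check: wrap based on the type of `image`, not `prompt`.
image = [image] if isinstance(image, PIL.Image.Image) else image
assert isinstance(image, list) and len(image) == 1
```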
diff --git a/src/diffusers/pipelines/latte/pipeline_latte.py b/src/diffusers/pipelines/latte/pipeline_latte.py
index 0e52a7d5f4c0..8b78a7e75681 100644
--- a/src/diffusers/pipelines/latte/pipeline_latte.py
+++ b/src/diffusers/pipelines/latte/pipeline_latte.py
@@ -56,7 +56,7 @@
>>> from diffusers.utils import export_to_gif
>>> # You can replace the checkpoint id with "maxin-cn/Latte-1" too.
- >>> pipe = LattePipeline.from_pretrained("maxin-cn/Latte-1", torch_dtype=torch.float16).to("cuda")
+ >>> pipe = LattePipeline.from_pretrained("maxin-cn/Latte-1", torch_dtype=torch.float16)
>>> # Enable memory optimizations.
>>> pipe.enable_model_cpu_offload()
diff --git a/src/diffusers/pipelines/lumina/pipeline_lumina.py b/src/diffusers/pipelines/lumina/pipeline_lumina.py
index be6fe203d7e5..1ded6013fb35 100644
--- a/src/diffusers/pipelines/lumina/pipeline_lumina.py
+++ b/src/diffusers/pipelines/lumina/pipeline_lumina.py
@@ -54,7 +54,7 @@
>>> pipe = LuminaText2ImgPipeline.from_pretrained(
... "Alpha-VLLM/Lumina-Next-SFT-diffusers", torch_dtype=torch.bfloat16
- ... ).cuda()
+ ... )
>>> # Enable memory optimizations.
>>> pipe.enable_model_cpu_offload()
diff --git a/src/diffusers/pipelines/pag/__init__.py b/src/diffusers/pipelines/pag/__init__.py
index 5635fdebbe29..d8842ce91175 100644
--- a/src/diffusers/pipelines/pag/__init__.py
+++ b/src/diffusers/pipelines/pag/__init__.py
@@ -24,6 +24,7 @@
else:
_import_structure["pipeline_pag_controlnet_sd"] = ["StableDiffusionControlNetPAGPipeline"]
_import_structure["pipeline_pag_controlnet_sd_xl"] = ["StableDiffusionXLControlNetPAGPipeline"]
+ _import_structure["pipeline_pag_controlnet_sd_xl_img2img"] = ["StableDiffusionXLControlNetPAGImg2ImgPipeline"]
_import_structure["pipeline_pag_hunyuandit"] = ["HunyuanDiTPAGPipeline"]
_import_structure["pipeline_pag_kolors"] = ["KolorsPAGPipeline"]
_import_structure["pipeline_pag_pixart_sigma"] = ["PixArtSigmaPAGPipeline"]
@@ -44,6 +45,7 @@
else:
from .pipeline_pag_controlnet_sd import StableDiffusionControlNetPAGPipeline
from .pipeline_pag_controlnet_sd_xl import StableDiffusionXLControlNetPAGPipeline
+ from .pipeline_pag_controlnet_sd_xl_img2img import StableDiffusionXLControlNetPAGImg2ImgPipeline
from .pipeline_pag_hunyuandit import HunyuanDiTPAGPipeline
from .pipeline_pag_kolors import KolorsPAGPipeline
from .pipeline_pag_pixart_sigma import PixArtSigmaPAGPipeline
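Note: with the registrations above, the new ControlNet + PAG SDXL img2img pipeline is importable from the `pag` module and resolvable through the img2img auto-mapping. A quick check that only exercises the lookups (no weights are downloaded).

```py
from diffusers.pipelines.auto_pipeline import AUTO_IMAGE2IMAGE_PIPELINES_MAPPING
from diffusers.pipelines.pag import StableDiffusionXLControlNetPAGImg2ImgPipeline

cls = AUTO_IMAGE2IMAGE_PIPELINES_MAPPING["stable-diffusion-xl-controlnet-pag"]
assert cls is StableDiffusionXLControlNetPAGImg2ImgPipeline
```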
diff --git a/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py b/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py
new file mode 100644
index 000000000000..66398483e046
--- /dev/null
+++ b/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py
@@ -0,0 +1,1685 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn.functional as F
+from transformers import (
+ CLIPImageProcessor,
+ CLIPTextModel,
+ CLIPTextModelWithProjection,
+ CLIPTokenizer,
+ CLIPVisionModelWithProjection,
+)
+
+from diffusers.utils.import_utils import is_invisible_watermark_available
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import (
+ FromSingleFileMixin,
+ IPAdapterMixin,
+ StableDiffusionXLLoraLoaderMixin,
+ TextualInversionLoaderMixin,
+)
+from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel
+from ...models.attention_processor import (
+ AttnProcessor2_0,
+ XFormersAttnProcessor,
+)
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+ USE_PEFT_BACKEND,
+ logging,
+ replace_example_docstring,
+ scale_lora_layers,
+ unscale_lora_layers,
+)
+from ...utils.torch_utils import is_compiled_module, randn_tensor
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
+from .pag_utils import PAGMixin
+
+
+if is_invisible_watermark_available():
+ from ..stable_diffusion_xl.watermark import StableDiffusionXLWatermarker
+
+from ..controlnet.multicontrolnet import MultiControlNetModel
+
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+
+EXAMPLE_DOC_STRING = """
+ Examples:
+ ```py
+ >>> # pip install accelerate transformers safetensors diffusers
+
+ >>> import torch
+ >>> import numpy as np
+ >>> from PIL import Image
+
+ >>> from transformers import DPTFeatureExtractor, DPTForDepthEstimation
+ >>> from diffusers import ControlNetModel, StableDiffusionXLControlNetPAGImg2ImgPipeline, AutoencoderKL
+ >>> from diffusers.utils import load_image
+
+
+ >>> depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to("cuda")
+ >>> feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-hybrid-midas")
+ >>> controlnet = ControlNetModel.from_pretrained(
+ ... "diffusers/controlnet-depth-sdxl-1.0-small",
+ ... variant="fp16",
+ ... use_safetensors=True,
+ ... torch_dtype=torch.float16,
+ ... )
+ >>> vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
+ >>> pipe = StableDiffusionXLControlNetPAGImg2ImgPipeline.from_pretrained(
+ ... "stabilityai/stable-diffusion-xl-base-1.0",
+ ... controlnet=controlnet,
+ ... vae=vae,
+ ... variant="fp16",
+ ... use_safetensors=True,
+ ... torch_dtype=torch.float16,
+ ... enable_pag=True,
+ ... )
+ >>> pipe.enable_model_cpu_offload()
+
+
+ >>> def get_depth_map(image):
+ ... image = feature_extractor(images=image, return_tensors="pt").pixel_values.to("cuda")
+ ... with torch.no_grad(), torch.autocast("cuda"):
+ ... depth_map = depth_estimator(image).predicted_depth
+
+ ... depth_map = torch.nn.functional.interpolate(
+ ... depth_map.unsqueeze(1),
+ ... size=(1024, 1024),
+ ... mode="bicubic",
+ ... align_corners=False,
+ ... )
+ ... depth_min = torch.amin(depth_map, dim=[1, 2, 3], keepdim=True)
+ ... depth_max = torch.amax(depth_map, dim=[1, 2, 3], keepdim=True)
+ ... depth_map = (depth_map - depth_min) / (depth_max - depth_min)
+ ... image = torch.cat([depth_map] * 3, dim=1)
+ ... image = image.permute(0, 2, 3, 1).cpu().numpy()[0]
+ ... image = Image.fromarray((image * 255.0).clip(0, 255).astype(np.uint8))
+ ... return image
+
+
+ >>> prompt = "A robot, 4k photo"
+ >>> image = load_image(
+ ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+ ... "/kandinsky/cat.png"
+ ... ).resize((1024, 1024))
+ >>> controlnet_conditioning_scale = 0.5 # recommended for good generalization
+ >>> depth_image = get_depth_map(image)
+
+ >>> images = pipe(
+ ... prompt,
+ ... image=image,
+ ... control_image=depth_image,
+ ... strength=0.99,
+ ... num_inference_steps=50,
+ ... controlnet_conditioning_scale=controlnet_conditioning_scale,
+ ... ).images
+ >>> images[0].save("robot_cat.png")
+ ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+ encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+ if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+ return encoder_output.latent_dist.sample(generator)
+ elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+ return encoder_output.latent_dist.mode()
+ elif hasattr(encoder_output, "latents"):
+ return encoder_output.latents
+ else:
+ raise AttributeError("Could not access latents of provided encoder_output")
+
+
+class StableDiffusionXLControlNetPAGImg2ImgPipeline(
+ DiffusionPipeline,
+ StableDiffusionMixin,
+ TextualInversionLoaderMixin,
+ StableDiffusionXLLoraLoaderMixin,
+ FromSingleFileMixin,
+ IPAdapterMixin,
+ PAGMixin,
+):
+ r"""
+ Pipeline for image-to-image generation using Stable Diffusion XL with ControlNet guidance.
+
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+ The pipeline also inherits the following loading methods:
+ - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+ - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+ - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+ - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+ Args:
+ vae ([`AutoencoderKL`]):
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+ text_encoder ([`CLIPTextModel`]):
+ Frozen text-encoder. Stable Diffusion uses the text portion of
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+ text_encoder_2 ([`CLIPTextModelWithProjection`]):
+ Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+ specifically the
+ [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
+ variant.
+ tokenizer (`CLIPTokenizer`):
+ Tokenizer of class
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+ tokenizer_2 (`CLIPTokenizer`):
+ Second Tokenizer of class
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+ controlnet ([`ControlNetModel`] or `List[ControlNetModel]`):
+ Provides additional conditioning to the unet during the denoising process. If you set multiple ControlNets
+ as a list, the outputs from each ControlNet are added together to create one combined additional
+ conditioning.
+ scheduler ([`SchedulerMixin`]):
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+ requires_aesthetics_score (`bool`, *optional*, defaults to `False`):
+ Whether the `unet` requires an `aesthetic_score` condition to be passed during inference. Also see the
+ config of `stabilityai/stable-diffusion-xl-refiner-1-0`.
+ force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `True`):
+ Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
+ `stabilityai/stable-diffusion-xl-base-1-0`.
+ add_watermarker (`bool`, *optional*):
+ Whether to use the [invisible_watermark library](https://github.com/ShieldMnt/invisible-watermark/) to
+ watermark output images. If not defined, it will default to True if the package is installed, otherwise no
+ watermarker will be used.
+ feature_extractor ([`~transformers.CLIPImageProcessor`]):
+ A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+ """
+
+ model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->unet->vae"
+ _optional_components = [
+ "tokenizer",
+ "tokenizer_2",
+ "text_encoder",
+ "text_encoder_2",
+ "feature_extractor",
+ "image_encoder",
+ ]
+ _callback_tensor_inputs = [
+ "latents",
+ "prompt_embeds",
+ "negative_prompt_embeds",
+ "add_text_embeds",
+ "add_time_ids",
+ "negative_pooled_prompt_embeds",
+ "add_neg_time_ids",
+ ]
+
+ def __init__(
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ text_encoder_2: CLIPTextModelWithProjection,
+ tokenizer: CLIPTokenizer,
+ tokenizer_2: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel],
+ scheduler: KarrasDiffusionSchedulers,
+ requires_aesthetics_score: bool = False,
+ force_zeros_for_empty_prompt: bool = True,
+ add_watermarker: Optional[bool] = None,
+ feature_extractor: CLIPImageProcessor = None,
+ image_encoder: CLIPVisionModelWithProjection = None,
+ pag_applied_layers: Union[str, List[str]] = "mid", # ["mid"], ["down.block_1", "up.block_0.attentions_0"]
+ ):
+ super().__init__()
+
+ if isinstance(controlnet, (list, tuple)):
+ controlnet = MultiControlNetModel(controlnet)
+
+ self.register_modules(
+ vae=vae,
+ text_encoder=text_encoder,
+ text_encoder_2=text_encoder_2,
+ tokenizer=tokenizer,
+ tokenizer_2=tokenizer_2,
+ unet=unet,
+ controlnet=controlnet,
+ scheduler=scheduler,
+ feature_extractor=feature_extractor,
+ image_encoder=image_encoder,
+ )
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
+ self.control_image_processor = VaeImageProcessor(
+ vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
+ )
+ add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()
+
+ if add_watermarker:
+ self.watermark = StableDiffusionXLWatermarker()
+ else:
+ self.watermark = None
+
+ self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
+ self.register_to_config(requires_aesthetics_score=requires_aesthetics_score)
+
+ self.set_pag_applied_layers(pag_applied_layers)
+
+ # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
+ def encode_prompt(
+ self,
+ prompt: str,
+ prompt_2: Optional[str] = None,
+ device: Optional[torch.device] = None,
+ num_images_per_prompt: int = 1,
+ do_classifier_free_guidance: bool = True,
+ negative_prompt: Optional[str] = None,
+ negative_prompt_2: Optional[str] = None,
+ prompt_embeds: Optional[torch.Tensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
+ pooled_prompt_embeds: Optional[torch.Tensor] = None,
+ negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+ lora_scale: Optional[float] = None,
+ clip_skip: Optional[int] = None,
+ ):
+ r"""
+ Encodes the prompt into text encoder hidden states.
+
+ Args:
+ prompt (`str` or `List[str]`, *optional*):
+ prompt to be encoded
+ prompt_2 (`str` or `List[str]`, *optional*):
+ The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+ used in both text-encoders
+ device: (`torch.device`):
+ torch device
+ num_images_per_prompt (`int`):
+ number of images that should be generated per prompt
+ do_classifier_free_guidance (`bool`):
+ whether to use classifier free guidance or not
+ negative_prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+ less than `1`).
+ negative_prompt_2 (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+ `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+ prompt_embeds (`torch.Tensor`, *optional*):
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+ provided, text embeddings will be generated from `prompt` input argument.
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+ argument.
+ pooled_prompt_embeds (`torch.Tensor`, *optional*):
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
+ negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+ input argument.
+ lora_scale (`float`, *optional*):
+ A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+ clip_skip (`int`, *optional*):
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+ the output of the pre-final layer will be used for computing the prompt embeddings.
+ """
+ device = device or self._execution_device
+
+ # set lora scale so that monkey patched LoRA
+ # function of text encoder can correctly access it
+ if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
+ self._lora_scale = lora_scale
+
+ # dynamically adjust the LoRA scale
+ if self.text_encoder is not None:
+ if not USE_PEFT_BACKEND:
+ adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+ else:
+ scale_lora_layers(self.text_encoder, lora_scale)
+
+ if self.text_encoder_2 is not None:
+ if not USE_PEFT_BACKEND:
+ adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
+ else:
+ scale_lora_layers(self.text_encoder_2, lora_scale)
+
+ prompt = [prompt] if isinstance(prompt, str) else prompt
+
+ if prompt is not None:
+ batch_size = len(prompt)
+ else:
+ batch_size = prompt_embeds.shape[0]
+
+ # Define tokenizers and text encoders
+ tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
+ text_encoders = (
+ [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
+ )
+
+ if prompt_embeds is None:
+ prompt_2 = prompt_2 or prompt
+ prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+ # textual inversion: process multi-vector tokens if necessary
+ prompt_embeds_list = []
+ prompts = [prompt, prompt_2]
+ for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
+ if isinstance(self, TextualInversionLoaderMixin):
+ prompt = self.maybe_convert_prompt(prompt, tokenizer)
+
+ text_inputs = tokenizer(
+ prompt,
+ padding="max_length",
+ max_length=tokenizer.model_max_length,
+ truncation=True,
+ return_tensors="pt",
+ )
+
+ text_input_ids = text_inputs.input_ids
+ untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+ text_input_ids, untruncated_ids
+ ):
+ removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
+ logger.warning(
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
+ f" {tokenizer.model_max_length} tokens: {removed_text}"
+ )
+
+ prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+
+ # We are only ALWAYS interested in the pooled output of the final text encoder
+ pooled_prompt_embeds = prompt_embeds[0]
+ if clip_skip is None:
+ prompt_embeds = prompt_embeds.hidden_states[-2]
+ else:
+ # "2" because SDXL always indexes from the penultimate layer.
+ prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
+
+ prompt_embeds_list.append(prompt_embeds)
+
+ prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+
+ # get unconditional embeddings for classifier free guidance
+ zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
+ if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
+ negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+ negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
+ elif do_classifier_free_guidance and negative_prompt_embeds is None:
+ negative_prompt = negative_prompt or ""
+ negative_prompt_2 = negative_prompt_2 or negative_prompt
+
+ # normalize str to list
+ negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+ negative_prompt_2 = (
+ batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
+ )
+
+ uncond_tokens: List[str]
+ if prompt is not None and type(prompt) is not type(negative_prompt):
+ raise TypeError(
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+ f" {type(prompt)}."
+ )
+ elif batch_size != len(negative_prompt):
+ raise ValueError(
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+ " the batch size of `prompt`."
+ )
+ else:
+ uncond_tokens = [negative_prompt, negative_prompt_2]
+
+ negative_prompt_embeds_list = []
+ for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
+ if isinstance(self, TextualInversionLoaderMixin):
+ negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)
+
+ max_length = prompt_embeds.shape[1]
+ uncond_input = tokenizer(
+ negative_prompt,
+ padding="max_length",
+ max_length=max_length,
+ truncation=True,
+ return_tensors="pt",
+ )
+
+ negative_prompt_embeds = text_encoder(
+ uncond_input.input_ids.to(device),
+ output_hidden_states=True,
+ )
+ # We are only ALWAYS interested in the pooled output of the final text encoder
+ negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+ negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
+
+ negative_prompt_embeds_list.append(negative_prompt_embeds)
+
+ negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
+
+ if self.text_encoder_2 is not None:
+ prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+ else:
+ prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+ bs_embed, seq_len, _ = prompt_embeds.shape
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+ prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+ if do_classifier_free_guidance:
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+ seq_len = negative_prompt_embeds.shape[1]
+
+ if self.text_encoder_2 is not None:
+ negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+ else:
+ negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+ negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+ negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+ pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+ bs_embed * num_images_per_prompt, -1
+ )
+ if do_classifier_free_guidance:
+ negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+ bs_embed * num_images_per_prompt, -1
+ )
+
+ if self.text_encoder is not None:
+ if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+ # Retrieve the original scale by scaling back the LoRA layers
+ unscale_lora_layers(self.text_encoder, lora_scale)
+
+ if self.text_encoder_2 is not None:
+ if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+ # Retrieve the original scale by scaling back the LoRA layers
+ unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+ return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+ def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+ dtype = next(self.image_encoder.parameters()).dtype
+
+ if not isinstance(image, torch.Tensor):
+ image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+ image = image.to(device=device, dtype=dtype)
+ if output_hidden_states:
+ image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+ image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+ uncond_image_enc_hidden_states = self.image_encoder(
+ torch.zeros_like(image), output_hidden_states=True
+ ).hidden_states[-2]
+ uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+ num_images_per_prompt, dim=0
+ )
+ return image_enc_hidden_states, uncond_image_enc_hidden_states
+ else:
+ image_embeds = self.image_encoder(image).image_embeds
+ image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+ uncond_image_embeds = torch.zeros_like(image_embeds)
+
+ return image_embeds, uncond_image_embeds
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+ def prepare_ip_adapter_image_embeds(
+ self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+ ):
+ image_embeds = []
+ if do_classifier_free_guidance:
+ negative_image_embeds = []
+ if ip_adapter_image_embeds is None:
+ if not isinstance(ip_adapter_image, list):
+ ip_adapter_image = [ip_adapter_image]
+
+ if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+ raise ValueError(
+ f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+ )
+
+ for single_ip_adapter_image, image_proj_layer in zip(
+ ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+ ):
+ output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+ single_image_embeds, single_negative_image_embeds = self.encode_image(
+ single_ip_adapter_image, device, 1, output_hidden_state
+ )
+
+ image_embeds.append(single_image_embeds[None, :])
+ if do_classifier_free_guidance:
+ negative_image_embeds.append(single_negative_image_embeds[None, :])
+ else:
+ for single_image_embeds in ip_adapter_image_embeds:
+ if do_classifier_free_guidance:
+ single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+ negative_image_embeds.append(single_negative_image_embeds)
+ image_embeds.append(single_image_embeds)
+
+ ip_adapter_image_embeds = []
+ for i, single_image_embeds in enumerate(image_embeds):
+ single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+ if do_classifier_free_guidance:
+ single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+ single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+ single_image_embeds = single_image_embeds.to(device=device)
+ ip_adapter_image_embeds.append(single_image_embeds)
+
+ return ip_adapter_image_embeds
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+ def prepare_extra_step_kwargs(self, generator, eta):
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+ # and should be between [0, 1]
+
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+ extra_step_kwargs = {}
+ if accepts_eta:
+ extra_step_kwargs["eta"] = eta
+
+ # check if the scheduler accepts generator
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+ if accepts_generator:
+ extra_step_kwargs["generator"] = generator
+ return extra_step_kwargs
+
+ # Copied from diffusers.pipelines.controlnet.pipeline_controlnet_sd_xl_img2img.StableDiffusionXLControlNetImg2ImgPipeline.check_inputs
+ def check_inputs(
+ self,
+ prompt,
+ prompt_2,
+ image,
+ strength,
+ num_inference_steps,
+ callback_steps,
+ negative_prompt=None,
+ negative_prompt_2=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ pooled_prompt_embeds=None,
+ negative_pooled_prompt_embeds=None,
+ ip_adapter_image=None,
+ ip_adapter_image_embeds=None,
+ controlnet_conditioning_scale=1.0,
+ control_guidance_start=0.0,
+ control_guidance_end=1.0,
+ callback_on_step_end_tensor_inputs=None,
+ ):
+ if strength < 0 or strength > 1:
+ raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+ if num_inference_steps is None:
+ raise ValueError("`num_inference_steps` cannot be None.")
+ elif not isinstance(num_inference_steps, int) or num_inference_steps <= 0:
+ raise ValueError(
+ f"`num_inference_steps` has to be a positive integer but is {num_inference_steps} of type"
+ f" {type(num_inference_steps)}."
+ )
+
+ if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+ raise ValueError(
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+ f" {type(callback_steps)}."
+ )
+
+ if callback_on_step_end_tensor_inputs is not None and not all(
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+ ):
+ raise ValueError(
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+ )
+
+ if prompt is not None and prompt_embeds is not None:
+ raise ValueError(
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+ " only forward one of the two."
+ )
+ elif prompt_2 is not None and prompt_embeds is not None:
+ raise ValueError(
+ f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+ " only forward one of the two."
+ )
+ elif prompt is None and prompt_embeds is None:
+ raise ValueError(
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+ )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+ elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+ raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+
+ if negative_prompt is not None and negative_prompt_embeds is not None:
+ raise ValueError(
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+ )
+ elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+ raise ValueError(
+ f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+ )
+
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
+ raise ValueError(
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+ f" {negative_prompt_embeds.shape}."
+ )
+
+ if prompt_embeds is not None and pooled_prompt_embeds is None:
+ raise ValueError(
+ "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+ )
+
+ if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+ raise ValueError(
+ "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+ )
+
+ # `prompt` needs more sophisticated handling when there are multiple
+ # conditionings.
+ if isinstance(self.controlnet, MultiControlNetModel):
+ if isinstance(prompt, list):
+ logger.warning(
+ f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}"
+ " prompts. The conditionings will be fixed across the prompts."
+ )
+
+ # Check `image`
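+ # `torch.compile` wraps the ControlNet in a `torch._dynamo` `OptimizedModule`, so the type checks below go through `_orig_mod`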
+ is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance(
+ self.controlnet, torch._dynamo.eval_frame.OptimizedModule
+ )
+ if (
+ isinstance(self.controlnet, ControlNetModel)
+ or is_compiled
+ and isinstance(self.controlnet._orig_mod, ControlNetModel)
+ ):
+ self.check_image(image, prompt, prompt_embeds)
+ elif (
+ isinstance(self.controlnet, MultiControlNetModel)
+ or is_compiled
+ and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+ ):
+ if not isinstance(image, list):
+ raise TypeError("For multiple controlnets: `image` must be type `list`")
+
+ # When `image` is a nested list:
+ # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]])
+ elif any(isinstance(i, list) for i in image):
+ raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+ elif len(image) != len(self.controlnet.nets):
+ raise ValueError(
+ f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets."
+ )
+
+ for image_ in image:
+ self.check_image(image_, prompt, prompt_embeds)
+ else:
+ assert False
+
+ # Check `controlnet_conditioning_scale`
+ if (
+ isinstance(self.controlnet, ControlNetModel)
+ or is_compiled
+ and isinstance(self.controlnet._orig_mod, ControlNetModel)
+ ):
+ if not isinstance(controlnet_conditioning_scale, float):
+ raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
+ elif (
+ isinstance(self.controlnet, MultiControlNetModel)
+ or is_compiled
+ and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
+ ):
+ if isinstance(controlnet_conditioning_scale, list):
+ if any(isinstance(i, list) for i in controlnet_conditioning_scale):
+ raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+ elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len(
+ self.controlnet.nets
+ ):
+ raise ValueError(
+ "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
+ " the same length as the number of controlnets"
+ )
+ else:
+ assert False
+
+ if not isinstance(control_guidance_start, (tuple, list)):
+ control_guidance_start = [control_guidance_start]
+
+ if not isinstance(control_guidance_end, (tuple, list)):
+ control_guidance_end = [control_guidance_end]
+
+ if len(control_guidance_start) != len(control_guidance_end):
+ raise ValueError(
+ f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list."
+ )
+
+ if isinstance(self.controlnet, MultiControlNetModel):
+ if len(control_guidance_start) != len(self.controlnet.nets):
+ raise ValueError(
+ f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}."
+ )
+
+ for start, end in zip(control_guidance_start, control_guidance_end):
+ if start >= end:
+ raise ValueError(
+ f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}."
+ )
+ if start < 0.0:
+ raise ValueError(f"control guidance start: {start} can't be smaller than 0.")
+ if end > 1.0:
+ raise ValueError(f"control guidance end: {end} can't be larger than 1.0.")
+
+ if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+ raise ValueError(
+ "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+ )
+
+ if ip_adapter_image_embeds is not None:
+ if not isinstance(ip_adapter_image_embeds, list):
+ raise ValueError(
+ f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+ )
+ elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+ raise ValueError(
+ f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+ )
+
+ # Copied from diffusers.pipelines.controlnet.pipeline_controlnet_sd_xl.StableDiffusionXLControlNetPipeline.check_image
+ def check_image(self, image, prompt, prompt_embeds):
+ image_is_pil = isinstance(image, PIL.Image.Image)
+ image_is_tensor = isinstance(image, torch.Tensor)
+ image_is_np = isinstance(image, np.ndarray)
+ image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image)
+ image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor)
+ image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray)
+
+ if (
+ not image_is_pil
+ and not image_is_tensor
+ and not image_is_np
+ and not image_is_pil_list
+ and not image_is_tensor_list
+ and not image_is_np_list
+ ):
+ raise TypeError(
+ f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}"
+ )
+
+ if image_is_pil:
+ image_batch_size = 1
+ else:
+ image_batch_size = len(image)
+
+ if prompt is not None and isinstance(prompt, str):
+ prompt_batch_size = 1
+ elif prompt is not None and isinstance(prompt, list):
+ prompt_batch_size = len(prompt)
+ elif prompt_embeds is not None:
+ prompt_batch_size = prompt_embeds.shape[0]
+
+ if image_batch_size != 1 and image_batch_size != prompt_batch_size:
+ raise ValueError(
+ f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
+ )
+
+ # Copied from diffusers.pipelines.controlnet.pipeline_controlnet_sd_xl.StableDiffusionXLControlNetPipeline.prepare_image
+ def prepare_control_image(
+ self,
+ image,
+ width,
+ height,
+ batch_size,
+ num_images_per_prompt,
+ device,
+ dtype,
+ do_classifier_free_guidance=False,
+ guess_mode=False,
+ ):
+ image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)
+ image_batch_size = image.shape[0]
+
+ if image_batch_size == 1:
+ repeat_by = batch_size
+ else:
+ # image batch size is the same as prompt batch size
+ repeat_by = num_images_per_prompt
+
+ image = image.repeat_interleave(repeat_by, dim=0)
+
+ image = image.to(device=device, dtype=dtype)
+
+ if do_classifier_free_guidance and not guess_mode:
+ image = torch.cat([image] * 2)
+
+ return image
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
+ def get_timesteps(self, num_inference_steps, strength, device):
+ # get the original timestep using init_timestep
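+ # e.g. with num_inference_steps=50 and strength=0.8, init_timestep=40: the first 10 scheduled timesteps are skipped and 40 denoising steps run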
+ init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+ t_start = max(num_inference_steps - init_timestep, 0)
+ timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+ if hasattr(self.scheduler, "set_begin_index"):
+ self.scheduler.set_begin_index(t_start * self.scheduler.order)
+
+ return timesteps, num_inference_steps - t_start
+
+ # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline.prepare_latents
+ def prepare_latents(
+ self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None, add_noise=True
+ ):
+ if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
+ raise ValueError(
+ f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+ )
+
+ latents_mean = latents_std = None
+ if hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None:
+ latents_mean = torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1)
+ if hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None:
+ latents_std = torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1)
+
+ # Offload text encoder if `enable_model_cpu_offload` was enabled
+ if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+ self.text_encoder_2.to("cpu")
+ torch.cuda.empty_cache()
+
+ image = image.to(device=device, dtype=dtype)
+
+ batch_size = batch_size * num_images_per_prompt
+
+ if image.shape[1] == 4:
+ init_latents = image
+
+ else:
+ # make sure the VAE is in float32 mode, as it overflows in float16
+ if self.vae.config.force_upcast:
+ image = image.float()
+ self.vae.to(dtype=torch.float32)
+
+ if isinstance(generator, list) and len(generator) != batch_size:
+ raise ValueError(
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+ )
+
+ elif isinstance(generator, list):
+ if image.shape[0] < batch_size and batch_size % image.shape[0] == 0:
+ image = torch.cat([image] * (batch_size // image.shape[0]), dim=0)
+ elif image.shape[0] < batch_size and batch_size % image.shape[0] != 0:
+ raise ValueError(
+ f"Cannot duplicate `image` of batch size {image.shape[0]} to effective batch_size {batch_size} "
+ )
+
+ init_latents = [
+ retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+ for i in range(batch_size)
+ ]
+ init_latents = torch.cat(init_latents, dim=0)
+ else:
+ init_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+ if self.vae.config.force_upcast:
+ self.vae.to(dtype)
+
+ init_latents = init_latents.to(dtype)
+ if latents_mean is not None and latents_std is not None:
+ latents_mean = latents_mean.to(device=device, dtype=dtype)
+ latents_std = latents_std.to(device=device, dtype=dtype)
+ init_latents = (init_latents - latents_mean) * self.vae.config.scaling_factor / latents_std
+ else:
+ init_latents = self.vae.config.scaling_factor * init_latents
+
+ if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
+ # expand init_latents for batch_size
+ additional_image_per_prompt = batch_size // init_latents.shape[0]
+ init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
+ elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
+ raise ValueError(
+ f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+ )
+ else:
+ init_latents = torch.cat([init_latents], dim=0)
+
+ if add_noise:
+ shape = init_latents.shape
+ noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+ # get latents
+ init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+
+ latents = init_latents
+
+ return latents
+
+ # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline._get_add_time_ids
+ def _get_add_time_ids(
+ self,
+ original_size,
+ crops_coords_top_left,
+ target_size,
+ aesthetic_score,
+ negative_aesthetic_score,
+ negative_original_size,
+ negative_crops_coords_top_left,
+ negative_target_size,
+ dtype,
+ text_encoder_projection_dim=None,
+ ):
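+ # SDXL micro-conditioning: the time ids pack (original_size, crop coords, aesthetic score) when the config requires an aesthetics score (refiner-style checkpoints), and (original_size, crop coords, target_size) otherwise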
+ if self.config.requires_aesthetics_score:
+ add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,))
+ add_neg_time_ids = list(
+ negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,)
+ )
+ else:
+ add_time_ids = list(original_size + crops_coords_top_left + target_size)
+ add_neg_time_ids = list(negative_original_size + crops_coords_top_left + negative_target_size)
+
+ passed_add_embed_dim = (
+ self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
+ )
+ expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
+
+ if (
+ expected_add_embed_dim > passed_add_embed_dim
+ and (expected_add_embed_dim - passed_add_embed_dim) == self.unet.config.addition_time_embed_dim
+ ):
+ raise ValueError(
+ f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to enable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=True)` to make sure `aesthetic_score` {aesthetic_score} and `negative_aesthetic_score` {negative_aesthetic_score} is correctly used by the model."
+ )
+ elif (
+ expected_add_embed_dim < passed_add_embed_dim
+ and (passed_add_embed_dim - expected_add_embed_dim) == self.unet.config.addition_time_embed_dim
+ ):
+ raise ValueError(
+ f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to disable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=False)` to make sure `target_size` {target_size} is correctly used by the model."
+ )
+ elif expected_add_embed_dim != passed_add_embed_dim:
+ raise ValueError(
+ f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
+ )
+
+ add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
+ add_neg_time_ids = torch.tensor([add_neg_time_ids], dtype=dtype)
+
+ return add_time_ids, add_neg_time_ids
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
+ def upcast_vae(self):
+ dtype = self.vae.dtype
+ self.vae.to(dtype=torch.float32)
+ use_torch_2_0_or_xformers = isinstance(
+ self.vae.decoder.mid_block.attentions[0].processor,
+ (
+ AttnProcessor2_0,
+ XFormersAttnProcessor,
+ ),
+ )
+ # if xformers or torch_2_0 is used attention block does not need
+ # to be in float32 which can save lots of memory
+ if use_torch_2_0_or_xformers:
+ self.vae.post_quant_conv.to(dtype)
+ self.vae.decoder.conv_in.to(dtype)
+ self.vae.decoder.mid_block.to(dtype)
+
+ @property
+ def guidance_scale(self):
+ return self._guidance_scale
+
+ @property
+ def clip_skip(self):
+ return self._clip_skip
+
+ # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+ # corresponds to doing no classifier free guidance.
+ @property
+ def do_classifier_free_guidance(self):
+ return self._guidance_scale > 1
+
+ @property
+ def cross_attention_kwargs(self):
+ return self._cross_attention_kwargs
+
+ @property
+ def num_timesteps(self):
+ return self._num_timesteps
+
+ @torch.no_grad()
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
+ def __call__(
+ self,
+ prompt: Union[str, List[str]] = None,
+ prompt_2: Optional[Union[str, List[str]]] = None,
+ image: PipelineImageInput = None,
+ control_image: PipelineImageInput = None,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ strength: float = 0.8,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 5.0,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ negative_prompt_2: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+ latents: Optional[torch.Tensor] = None,
+ prompt_embeds: Optional[torch.Tensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
+ pooled_prompt_embeds: Optional[torch.Tensor] = None,
+ negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+ ip_adapter_image: Optional[PipelineImageInput] = None,
+ ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ controlnet_conditioning_scale: Union[float, List[float]] = 0.8,
+ guess_mode: bool = False,
+ control_guidance_start: Union[float, List[float]] = 0.0,
+ control_guidance_end: Union[float, List[float]] = 1.0,
+ original_size: Tuple[int, int] = None,
+ crops_coords_top_left: Tuple[int, int] = (0, 0),
+ target_size: Tuple[int, int] = None,
+ negative_original_size: Optional[Tuple[int, int]] = None,
+ negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
+ negative_target_size: Optional[Tuple[int, int]] = None,
+ aesthetic_score: float = 6.0,
+ negative_aesthetic_score: float = 2.5,
+ clip_skip: Optional[int] = None,
+ callback_on_step_end: Optional[
+ Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+ ] = None,
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+ pag_scale: float = 3.0,
+ pag_adaptive_scale: float = 0.0,
+ ):
+ r"""
+ Function invoked when calling the pipeline for generation.
+
+ Args:
+ prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
+ instead.
+ prompt_2 (`str` or `List[str]`, *optional*):
+ The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+ used in both text-encoders
+ image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,
+ `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
+ The initial image to be used as the starting point for the image generation process. Can also accept
+ image latents as `image`; if latents are passed directly, they will not be encoded again.
+ control_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,
+ `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
+ The ControlNet input condition. ControlNet uses this input condition to generate guidance for the UNet. If
+ the type is specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also
+ be accepted as an image. The dimensions of the output image default to `image`'s dimensions. If height
+ and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in
+ init, images must be passed as a list such that each element of the list can be correctly batched for
+ input to a single ControlNet.
+ height (`int`, *optional*, defaults to the size of control_image):
+ The height in pixels of the generated image. Anything below 512 pixels won't work well for
+ [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+ and checkpoints that are not specifically fine-tuned on low resolutions.
+ width (`int`, *optional*, defaults to the size of control_image):
+ The width in pixels of the generated image. Anything below 512 pixels won't work well for
+ [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+ and checkpoints that are not specifically fine-tuned on low resolutions.
+ strength (`float`, *optional*, defaults to 0.8):
+ Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
+ starting point and more noise is added the higher the `strength`. The number of denoising steps depends
+ on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
+ process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
+ essentially ignores `image`.
+ num_inference_steps (`int`, *optional*, defaults to 50):
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+ expense of slower inference.
+ guidance_scale (`float`, *optional*, defaults to 5.0):
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+ 1`. A higher guidance scale encourages the model to generate images that are closely linked to the text
+ `prompt`, usually at the expense of lower image quality.
+ negative_prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+ less than `1`).
+ negative_prompt_2 (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+ `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+ eta (`float`, *optional*, defaults to 0.0):
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+ [`schedulers.DDIMScheduler`], will be ignored for others.
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+ to make generation deterministic.
+ latents (`torch.Tensor`, *optional*):
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+ tensor will be generated by sampling using the supplied random `generator`.
+ prompt_embeds (`torch.Tensor`, *optional*):
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+ provided, text embeddings will be generated from `prompt` input argument.
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+ argument.
+ pooled_prompt_embeds (`torch.Tensor`, *optional*):
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
+ negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+ input argument.
+ ip_adapter_image (`PipelineImageInput`, *optional*):
+ Optional image input to work with IP Adapters.
+ ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+ Pre-generated image embeddings for IP-Adapter. It should be a list whose length is the same as the
+ number of IP-Adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It
+ should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+ provided, embeddings are computed from the `ip_adapter_image` input argument.
+ output_type (`str`, *optional*, defaults to `"pil"`):
+ The output format of the generated image. Choose between
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+ plain tuple.
+ cross_attention_kwargs (`dict`, *optional*):
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+ `self.processor` in
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+ controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 0.8):
+ The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
+ to the residual in the original UNet. If multiple ControlNets are specified in init, you can set the
+ corresponding scale as a list.
+ guess_mode (`bool`, *optional*, defaults to `False`):
+ In this mode, the ControlNet encoder will try its best to recognize the content of the input image even if
+ you remove all prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended.
+ control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
+ The percentage of total steps at which the controlnet starts applying.
+ control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
+ The percentage of total steps at which the controlnet stops applying.
+ original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+ If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+ `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
+ explained in section 2.2 of
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+ crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+ `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+ `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+ `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+ target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+ For most cases, `target_size` should be set to the desired height and width of the generated image. If
+ not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
+ section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+ negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+ To negatively condition the generation process based on a specific image resolution. Part of SDXL's
+ micro-conditioning as explained in section 2.2 of
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+ information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+ negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+ To negatively condition the generation process based on specific crop coordinates. Part of SDXL's
+ micro-conditioning as explained in section 2.2 of
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+ information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+ negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+ To negatively condition the generation process based on a target image resolution. It should be the same
+ as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+ information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+ aesthetic_score (`float`, *optional*, defaults to 6.0):
+ Used to simulate an aesthetic score of the generated image by influencing the positive text condition.
+ Part of SDXL's micro-conditioning as explained in section 2.2 of
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+ negative_aesthetic_score (`float`, *optional*, defaults to 2.5):
+ Part of SDXL's micro-conditioning as explained in section 2.2 of
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). Can be used to
+ simulate an aesthetic score of the generated image by influencing the negative text condition.
+ clip_skip (`int`, *optional*):
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+ the output of the pre-final layer will be used for computing the prompt embeddings.
+ callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+ A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+ each denoising step during inference with the following arguments: `callback_on_step_end(self:
+ DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+ list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+ `._callback_tensor_inputs` attribute of your pipeline class.
+ pag_scale (`float`, *optional*, defaults to 3.0):
+ The scale factor for the perturbed attention guidance. If it is set to 0.0, the perturbed attention
+ guidance will not be used.
+ pag_adaptive_scale (`float`, *optional*, defaults to 0.0):
+ The adaptive scale factor for the perturbed attention guidance. If it is set to 0.0, `pag_scale` is
+ used.
+
+
+ Examples:
+
+ Returns:
+ [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`:
+ [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
+ `tuple` containing the output images.
+ """
+
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+ controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
+
+ # align format for control guidance
+ if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
+ control_guidance_start = len(control_guidance_end) * [control_guidance_start]
+ elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
+ control_guidance_end = len(control_guidance_start) * [control_guidance_end]
+ elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
+ mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1
+ control_guidance_start, control_guidance_end = (
+ mult * [control_guidance_start],
+ mult * [control_guidance_end],
+ )
+
+ # 1. Check inputs. Raise error if not correct
+ self.check_inputs(
+ prompt,
+ prompt_2,
+ control_image,
+ strength,
+ num_inference_steps,
+ None,
+ negative_prompt,
+ negative_prompt_2,
+ prompt_embeds,
+ negative_prompt_embeds,
+ pooled_prompt_embeds,
+ negative_pooled_prompt_embeds,
+ ip_adapter_image,
+ ip_adapter_image_embeds,
+ controlnet_conditioning_scale,
+ control_guidance_start,
+ control_guidance_end,
+ callback_on_step_end_tensor_inputs,
+ )
+
+ self._guidance_scale = guidance_scale
+ self._clip_skip = clip_skip
+ self._cross_attention_kwargs = cross_attention_kwargs
+ self._pag_scale = pag_scale
+ self._pag_adaptive_scale = pag_adaptive_scale
+
+ # 2. Define call parameters
+ if prompt is not None and isinstance(prompt, str):
+ batch_size = 1
+ elif prompt is not None and isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ batch_size = prompt_embeds.shape[0]
+
+ device = self._execution_device
+
+ if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
+ controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)
+
+ # 3.1 Encode input prompt
+ text_encoder_lora_scale = (
+ self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+ )
+ (
+ prompt_embeds,
+ negative_prompt_embeds,
+ pooled_prompt_embeds,
+ negative_pooled_prompt_embeds,
+ ) = self.encode_prompt(
+ prompt,
+ prompt_2,
+ device,
+ num_images_per_prompt,
+ self.do_classifier_free_guidance,
+ negative_prompt,
+ negative_prompt_2,
+ prompt_embeds=prompt_embeds,
+ negative_prompt_embeds=negative_prompt_embeds,
+ pooled_prompt_embeds=pooled_prompt_embeds,
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+ lora_scale=text_encoder_lora_scale,
+ clip_skip=self.clip_skip,
+ )
+
+ # 3.2 Encode ip_adapter_image
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+ ip_adapter_image_embeds = self.prepare_ip_adapter_image_embeds(
+ ip_adapter_image,
+ ip_adapter_image_embeds,
+ device,
+ batch_size * num_images_per_prompt,
+ self.do_classifier_free_guidance,
+ )
+
+ # 4. Prepare image and controlnet_conditioning_image
+ image = self.image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)
+
+ if isinstance(controlnet, ControlNetModel):
+ control_image = self.prepare_control_image(
+ image=control_image,
+ width=width,
+ height=height,
+ batch_size=batch_size * num_images_per_prompt,
+ num_images_per_prompt=num_images_per_prompt,
+ device=device,
+ dtype=controlnet.dtype,
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
+ guess_mode=False,
+ )
+ height, width = control_image.shape[-2:]
+ elif isinstance(controlnet, MultiControlNetModel):
+ control_images = []
+
+ for control_image_ in control_image:
+ control_image_ = self.prepare_control_image(
+ image=control_image_,
+ width=width,
+ height=height,
+ batch_size=batch_size * num_images_per_prompt,
+ num_images_per_prompt=num_images_per_prompt,
+ device=device,
+ dtype=controlnet.dtype,
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
+ guess_mode=False,
+ )
+
+ control_images.append(control_image_)
+
+ control_image = control_images
+ height, width = control_image[0].shape[-2:]
+ else:
+ assert False
+
+ # 5. Prepare timesteps
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+ latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+ self._num_timesteps = len(timesteps)
+
+ # 6. Prepare latent variables
+ if latents is None:
+ latents = self.prepare_latents(
+ image,
+ latent_timestep,
+ batch_size,
+ num_images_per_prompt,
+ prompt_embeds.dtype,
+ device,
+ generator,
+ True,
+ )
+
+ # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+ # 7.1 Create tensor stating which controlnets to keep
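+ # each entry is 1.0 while the relative step index lies inside [control_guidance_start, control_guidance_end] for that ControlNet, and 0.0 otherwise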
+ controlnet_keep = []
+ for i in range(len(timesteps)):
+ keeps = [
+ 1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
+ for s, e in zip(control_guidance_start, control_guidance_end)
+ ]
+ controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps)
+
+ # 7.2 Prepare added time ids & embeddings
+ if isinstance(control_image, list):
+ original_size = original_size or control_image[0].shape[-2:]
+ else:
+ original_size = original_size or control_image.shape[-2:]
+ target_size = target_size or (height, width)
+
+ if negative_original_size is None:
+ negative_original_size = original_size
+ if negative_target_size is None:
+ negative_target_size = target_size
+ add_text_embeds = pooled_prompt_embeds
+
+ if self.text_encoder_2 is None:
+ text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+ else:
+ text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+
+ add_time_ids, add_neg_time_ids = self._get_add_time_ids(
+ original_size,
+ crops_coords_top_left,
+ target_size,
+ aesthetic_score,
+ negative_aesthetic_score,
+ negative_original_size,
+ negative_crops_coords_top_left,
+ negative_target_size,
+ dtype=prompt_embeds.dtype,
+ text_encoder_projection_dim=text_encoder_projection_dim,
+ )
+ add_time_ids = add_time_ids.repeat(batch_size * num_images_per_prompt, 1)
+ add_neg_time_ids = add_neg_time_ids.repeat(batch_size * num_images_per_prompt, 1)
+
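+ # Re-batch each control image so its batch dimension matches the prompt embeddings:
+ # [uncond, cond, perturbed] with CFG + PAG, [cond, perturbed] with PAG only, [uncond, cond] with CFG only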
+ control_images = control_image if isinstance(control_image, list) else [control_image]
+ for i, single_image in enumerate(control_images):
+ if self.do_classifier_free_guidance:
+ single_image = single_image.chunk(2)[0]
+
+ if self.do_perturbed_attention_guidance:
+ single_image = self._prepare_perturbed_attention_guidance(
+ single_image, single_image, self.do_classifier_free_guidance
+ )
+ elif self.do_classifier_free_guidance:
+ single_image = torch.cat([single_image] * 2)
+ single_image = single_image.to(device)
+ control_images[i] = single_image
+
+ control_image = control_images if isinstance(control_image, list) else control_images[0]
+
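+ # Expand the IP-Adapter image embeddings in the same way so they line up with the guidance batch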
+ if ip_adapter_image_embeds is not None:
+ for i, image_embeds in enumerate(ip_adapter_image_embeds):
+ negative_image_embeds = None
+ if self.do_classifier_free_guidance:
+ negative_image_embeds, image_embeds = image_embeds.chunk(2)
+
+ if self.do_perturbed_attention_guidance:
+ image_embeds = self._prepare_perturbed_attention_guidance(
+ image_embeds, negative_image_embeds, self.do_classifier_free_guidance
+ )
+ elif self.do_classifier_free_guidance:
+ image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0)
+ image_embeds = image_embeds.to(device)
+ ip_adapter_image_embeds[i] = image_embeds
+
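+ # Assemble the final conditioning batch for the UNet: [negative, positive, positive] for PAG with CFG,
+ # [positive, positive] for PAG only, [negative, positive] for CFG only; the last copy is the one routed
+ # through the perturbed attention processors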
+ if self.do_perturbed_attention_guidance:
+ prompt_embeds = self._prepare_perturbed_attention_guidance(
+ prompt_embeds, negative_prompt_embeds, self.do_classifier_free_guidance
+ )
+ add_text_embeds = self._prepare_perturbed_attention_guidance(
+ add_text_embeds, negative_pooled_prompt_embeds, self.do_classifier_free_guidance
+ )
+ add_time_ids = self._prepare_perturbed_attention_guidance(
+ add_time_ids, add_neg_time_ids, self.do_classifier_free_guidance
+ )
+ elif self.do_classifier_free_guidance:
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+ add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+ add_time_ids = torch.cat([add_neg_time_ids, add_time_ids], dim=0)
+
+ prompt_embeds = prompt_embeds.to(device)
+ add_text_embeds = add_text_embeds.to(device)
+ add_time_ids = add_time_ids.to(device)
+ added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+
+ controlnet_prompt_embeds = prompt_embeds
+ controlnet_added_cond_kwargs = added_cond_kwargs
+
+ # 8. Denoising loop
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+
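+ # Swap in the PAG attention processors for the configured layers; the original processors are restored after the denoising loop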
+ if self.do_perturbed_attention_guidance:
+ original_attn_proc = self.unet.attn_processors
+ self._set_pag_attn_processor(
+ pag_applied_layers=self.pag_applied_layers,
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
+ )
+
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
+ for i, t in enumerate(timesteps):
+ # expand the latents to match the conditioning batch (classifier-free and/or perturbed attention guidance)
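+ # the expansion factor is 3 with CFG + PAG, 2 with only one of them, and 1 with neither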
+ latent_model_input = torch.cat([latents] * (prompt_embeds.shape[0] // latents.shape[0]))
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+ # controlnet(s) inference
+ control_model_input = latent_model_input
+
+ if isinstance(controlnet_keep[i], list):
+ cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
+ else:
+ controlnet_cond_scale = controlnet_conditioning_scale
+ if isinstance(controlnet_cond_scale, list):
+ controlnet_cond_scale = controlnet_cond_scale[0]
+ cond_scale = controlnet_cond_scale * controlnet_keep[i]
+ down_block_res_samples, mid_block_res_sample = self.controlnet(
+ control_model_input,
+ t,
+ encoder_hidden_states=controlnet_prompt_embeds,
+ controlnet_cond=control_image,
+ conditioning_scale=cond_scale,
+ guess_mode=False,
+ added_cond_kwargs=controlnet_added_cond_kwargs,
+ return_dict=False,
+ )
+
+ if ip_adapter_image_embeds is not None:
+ added_cond_kwargs["image_embeds"] = ip_adapter_image_embeds
+
+ # predict the noise residual
+ noise_pred = self.unet(
+ latent_model_input,
+ t,
+ encoder_hidden_states=prompt_embeds,
+ cross_attention_kwargs=self.cross_attention_kwargs,
+ down_block_additional_residuals=down_block_res_samples,
+ mid_block_additional_residual=mid_block_res_sample,
+ added_cond_kwargs=added_cond_kwargs,
+ return_dict=False,
+ )[0]
+
+ # perform guidance
+ if self.do_perturbed_attention_guidance:
+ noise_pred = self._apply_perturbed_attention_guidance(
+ noise_pred, self.do_classifier_free_guidance, self.guidance_scale, t
+ )
+ elif self.do_classifier_free_guidance:
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+ # compute the previous noisy sample x_t -> x_t-1
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+ if callback_on_step_end is not None:
+ callback_kwargs = {}
+ for k in callback_on_step_end_tensor_inputs:
+ callback_kwargs[k] = locals()[k]
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+ latents = callback_outputs.pop("latents", latents)
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+ add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
+ negative_pooled_prompt_embeds = callback_outputs.pop(
+ "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
+ )
+ add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
+ add_neg_time_ids = callback_outputs.pop("add_neg_time_ids", add_neg_time_ids)
+
+ # call the callback, if provided
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+ progress_bar.update()
+
+ # If we do sequential model offloading, let's offload unet and controlnet
+ # manually for max memory savings
+ if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+ self.unet.to("cpu")
+ self.controlnet.to("cpu")
+ torch.cuda.empty_cache()
+
+ if not output_type == "latent":
+ # make sure the VAE is in float32 mode, as it overflows in float16
+ needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+
+ if needs_upcasting:
+ self.upcast_vae()
+ latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+
+ # unscale/denormalize the latents
+ # denormalize with the mean and std if available and not None
+ has_latents_mean = hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None
+ has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None
+ if has_latents_mean and has_latents_std:
+ latents_mean = (
+ torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype)
+ )
+ latents_std = (
+ torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype)
+ )
+ latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean
+ else:
+ latents = latents / self.vae.config.scaling_factor
+
+ image = self.vae.decode(latents, return_dict=False)[0]
+
+ # cast back to fp16 if needed
+ if needs_upcasting:
+ self.vae.to(dtype=torch.float16)
+ else:
+ image = latents
+ return StableDiffusionXLPipelineOutput(images=image)
+
+ # apply watermark if available
+ if self.watermark is not None:
+ image = self.watermark.apply_watermark(image)
+
+ image = self.image_processor.postprocess(image, output_type=output_type)
+
+ # Offload all models
+ self.maybe_free_model_hooks()
+
+ if self.do_perturbed_attention_guidance:
+ self.unet.set_attn_processor(original_attn_proc)
+
+ if not return_dict:
+ return (image,)
+
+ return StableDiffusionXLPipelineOutput(images=image)
diff --git a/src/diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py b/src/diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py
index 3cb45786731d..b12ce5dc3e88 100644
--- a/src/diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py
+++ b/src/diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py
@@ -1471,6 +1471,14 @@ def denoising_value_valid(dnv):
generator,
self.do_classifier_free_guidance,
)
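+ # For PAG, keep only the conditional mask / masked-image latents and replicate them to the
+ # [uncond, cond, perturbed] (or [cond, perturbed]) batch layout used below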
+ if self.do_perturbed_attention_guidance:
+ if self.do_classifier_free_guidance:
+ mask, _ = mask.chunk(2)
+ masked_image_latents, _ = masked_image_latents.chunk(2)
+ mask = self._prepare_perturbed_attention_guidance(mask, mask, self.do_classifier_free_guidance)
+ masked_image_latents = self._prepare_perturbed_attention_guidance(
+ masked_image_latents, masked_image_latents, self.do_classifier_free_guidance
+ )
# 8. Check that sizes of mask, masked image and latents match
if num_channels_unet == 9:
@@ -1659,10 +1667,10 @@ def denoising_value_valid(dnv):
if num_channels_unet == 4:
init_latents_proper = image_latents
- if self.do_classifier_free_guidance:
- init_mask, _ = mask.chunk(2)
+ if self.do_perturbed_attention_guidance:
+ init_mask, *_ = mask.chunk(3) if self.do_classifier_free_guidance else mask.chunk(2)
else:
- init_mask = mask
+ init_mask, *_ = mask.chunk(2) if self.do_classifier_free_guidance else (mask,)
if i < len(timesteps) - 1:
noise_timestep = timesteps[i + 1]
diff --git a/src/diffusers/pipelines/pipeline_loading_utils.py b/src/diffusers/pipelines/pipeline_loading_utils.py
index a0af28803d79..318599f56063 100644
--- a/src/diffusers/pipelines/pipeline_loading_utils.py
+++ b/src/diffusers/pipelines/pipeline_loading_utils.py
@@ -22,7 +22,7 @@
from typing import Any, Dict, List, Optional, Union
import torch
-from huggingface_hub import model_info
+from huggingface_hub import ModelCard, model_info
from huggingface_hub.utils import validate_hf_hub_args
from packaging import version
@@ -33,6 +33,7 @@
ONNX_WEIGHTS_NAME,
SAFETENSORS_WEIGHTS_NAME,
WEIGHTS_NAME,
+ deprecate,
get_class_from_dynamic_module,
is_accelerate_available,
is_peft_available,
@@ -89,49 +90,46 @@
ALL_IMPORTABLE_CLASSES.update(LOADABLE_CLASSES[library])
-def is_safetensors_compatible(filenames, variant=None, passed_components=None) -> bool:
+def is_safetensors_compatible(filenames, passed_components=None, folder_names=None) -> bool:
"""
Checking for safetensors compatibility:
- - By default, all models are saved with the default pytorch serialization, so we use the list of default pytorch
- files to know which safetensors files are needed.
- - The model is safetensors compatible only if there is a matching safetensors file for every default pytorch file.
+ - The model is safetensors compatible only if there is a safetensors file for each model component present in
+ filenames.
Converting default pytorch serialized filenames to safetensors serialized filenames:
- For models from the diffusers library, just replace the ".bin" extension with ".safetensors"
- For models from the transformers library, the filename changes from "pytorch_model" to "model", and the ".bin"
extension is replaced with ".safetensors"
"""
- pt_filenames = []
-
- sf_filenames = set()
-
passed_components = passed_components or []
+ if folder_names is not None:
+ filenames = {f for f in filenames if os.path.split(f)[0] in folder_names}
+ # extract all components of the pipeline and their associated files
+ components = {}
for filename in filenames:
- _, extension = os.path.splitext(filename)
+ if not len(filename.split("/")) == 2:
+ continue
- if len(filename.split("/")) == 2 and filename.split("/")[0] in passed_components:
+ component, component_filename = filename.split("/")
+ if component in passed_components:
continue
- if extension == ".bin":
- pt_filenames.append(os.path.normpath(filename))
- elif extension == ".safetensors":
- sf_filenames.add(os.path.normpath(filename))
+ components.setdefault(component, [])
+ components[component].append(component_filename)
- for filename in pt_filenames:
- # filename = 'foo/bar/baz.bam' -> path = 'foo/bar', filename = 'baz', extension = '.bam'
- path, filename = os.path.split(filename)
- filename, extension = os.path.splitext(filename)
+ # iterate over all files of a component
+ # check if safetensor files exist for that component
+ # if variant is provided check if the variant of the safetensors exists
+ for component, component_filenames in components.items():
+ matches = []
+ for component_filename in component_filenames:
+ filename, extension = os.path.splitext(component_filename)
- if filename.startswith("pytorch_model"):
- filename = filename.replace("pytorch_model", "model")
- else:
- filename = filename
+ match_exists = extension == ".safetensors"
+ matches.append(match_exists)
- expected_sf_filename = os.path.normpath(os.path.join(path, filename))
- expected_sf_filename = f"{expected_sf_filename}.safetensors"
- if expected_sf_filename not in sf_filenames:
- logger.warning(f"{expected_sf_filename} not found")
+ if not any(matches):
return False
return True
@@ -749,3 +747,92 @@ def _fetch_class_library_tuple(module):
class_name = not_compiled_module.__class__.__name__
return (library, class_name)
+
+
+def _identify_model_variants(folder: str, variant: str, config: dict) -> dict:
+ model_variants = {}
+ if variant is not None:
+ for sub_folder in os.listdir(folder):
+ folder_path = os.path.join(folder, sub_folder)
+ is_folder = os.path.isdir(folder_path) and sub_folder in config
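+ # variant weight files are named like `diffusion_pytorch_model.fp16.safetensors`, so the variant is the second dot-separated segment of the filename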
+ variant_exists = is_folder and any(p.split(".")[1].startswith(variant) for p in os.listdir(folder_path))
+ if variant_exists:
+ model_variants[sub_folder] = variant
+ return model_variants
+
+
+def _resolve_custom_pipeline_and_cls(folder, config, custom_pipeline):
+ custom_class_name = None
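+ # a custom pipeline can ship as `<custom_pipeline>.py` inside the repo, or be referenced through a two-element `_class_name` of the form [module_name, class_name]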
+ if os.path.isfile(os.path.join(folder, f"{custom_pipeline}.py")):
+ custom_pipeline = os.path.join(folder, f"{custom_pipeline}.py")
+ elif isinstance(config["_class_name"], (list, tuple)) and os.path.isfile(
+ os.path.join(folder, f"{config['_class_name'][0]}.py")
+ ):
+ custom_pipeline = os.path.join(folder, f"{config['_class_name'][0]}.py")
+ custom_class_name = config["_class_name"][1]
+
+ return custom_pipeline, custom_class_name
+
+
+def _maybe_raise_warning_for_inpainting(pipeline_class, pretrained_model_name_or_path: str, config: dict):
+ if pipeline_class.__name__ == "StableDiffusionInpaintPipeline" and version.parse(
+ version.parse(config["_diffusers_version"]).base_version
+ ) <= version.parse("0.5.1"):
+ from diffusers import StableDiffusionInpaintPipeline, StableDiffusionInpaintPipelineLegacy
+
+ pipeline_class = StableDiffusionInpaintPipelineLegacy
+
+ deprecation_message = (
+ "You are using a legacy checkpoint for inpainting with Stable Diffusion, therefore we are loading the"
+ f" {StableDiffusionInpaintPipelineLegacy} class instead of {StableDiffusionInpaintPipeline}. For"
+ " better inpainting results, we strongly suggest using Stable Diffusion's official inpainting"
+ " checkpoint: https://huggingface.co/runwayml/stable-diffusion-inpainting instead or adapting your"
+ f" checkpoint {pretrained_model_name_or_path} to the format of"
+ " https://huggingface.co/runwayml/stable-diffusion-inpainting. Note that we do not actively maintain"
+ " the {StableDiffusionInpaintPipelineLegacy} class and will likely remove it in version 1.0.0."
+ )
+ deprecate("StableDiffusionInpaintPipelineLegacy", "1.0.0", deprecation_message, standard_warn=False)
+
+
+def _update_init_kwargs_with_connected_pipeline(
+ init_kwargs: dict, passed_pipe_kwargs: dict, passed_class_objs: dict, folder: str, **pipeline_loading_kwargs
+) -> dict:
+ from .pipeline_utils import DiffusionPipeline
+
+ modelcard = ModelCard.load(os.path.join(folder, "README.md"))
+ connected_pipes = {prefix: getattr(modelcard.data, prefix, [None])[0] for prefix in CONNECTED_PIPES_KEYS}
+
+ # We don't pass the scheduler argument, to match the existing logic:
+ # https://github.com/huggingface/diffusers/blob/867e0c919e1aa7ef8b03c8eb1460f4f875a683ae/src/diffusers/pipelines/pipeline_utils.py#L906C13-L925C14
+ pipeline_loading_kwargs_cp = pipeline_loading_kwargs.copy()
+ if pipeline_loading_kwargs_cp is not None and len(pipeline_loading_kwargs_cp) >= 1:
+ for k in pipeline_loading_kwargs:
+ if "scheduler" in k:
+ _ = pipeline_loading_kwargs_cp.pop(k)
+
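+ # kwargs and components prefixed with the connected pipe's name (e.g. `prior_text_encoder`) are forwarded to that pipeline with the prefix stripped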
+ def get_connected_passed_kwargs(prefix):
+ connected_passed_class_obj = {
+ k.replace(f"{prefix}_", ""): w for k, w in passed_class_objs.items() if k.split("_")[0] == prefix
+ }
+ connected_passed_pipe_kwargs = {
+ k.replace(f"{prefix}_", ""): w for k, w in passed_pipe_kwargs.items() if k.split("_")[0] == prefix
+ }
+
+ connected_passed_kwargs = {**connected_passed_class_obj, **connected_passed_pipe_kwargs}
+ return connected_passed_kwargs
+
+ connected_pipes = {
+ prefix: DiffusionPipeline.from_pretrained(
+ repo_id, **pipeline_loading_kwargs_cp, **get_connected_passed_kwargs(prefix)
+ )
+ for prefix, repo_id in connected_pipes.items()
+ if repo_id is not None
+ }
+
+ for prefix, connected_pipe in connected_pipes.items():
+ # add connected pipes to `init_kwargs` with _, e.g. "prior_text_encoder"
+ init_kwargs.update(
+ {"_".join([prefix, name]): component for name, component in connected_pipe.components.items()}
+ )
+
+ return init_kwargs
diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py
index 2cc9defc3ffa..aa6da17edfe7 100644
--- a/src/diffusers/pipelines/pipeline_utils.py
+++ b/src/diffusers/pipelines/pipeline_utils.py
@@ -75,7 +75,11 @@
_get_custom_pipeline_class,
_get_final_device_map,
_get_pipeline_class,
+ _identify_model_variants,
+ _maybe_raise_warning_for_inpainting,
+ _resolve_custom_pipeline_and_cls,
_unwrap_model,
+ _update_init_kwargs_with_connected_pipeline,
is_safetensors_compatible,
load_sub_model,
maybe_raise_or_warn,
@@ -622,6 +626,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
>>> pipeline.scheduler = scheduler
```
"""
+ # Copy the kwargs to re-use during loading connected pipeline.
+ kwargs_copied = kwargs.copy()
+
cache_dir = kwargs.pop("cache_dir", None)
force_download = kwargs.pop("force_download", False)
proxies = kwargs.pop("proxies", None)
@@ -722,33 +729,19 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
config_dict.pop("_ignore_files", None)
# 2. Define which model components should load variants
- # We retrieve the information by matching whether variant
- # model checkpoints exist in the subfolders
- model_variants = {}
- if variant is not None:
- for folder in os.listdir(cached_folder):
- folder_path = os.path.join(cached_folder, folder)
- is_folder = os.path.isdir(folder_path) and folder in config_dict
- variant_exists = is_folder and any(
- p.split(".")[1].startswith(variant) for p in os.listdir(folder_path)
- )
- if variant_exists:
- model_variants[folder] = variant
+ # We retrieve the information by matching whether variant model checkpoints exist in the subfolders.
+ # Example: `diffusion_pytorch_model.safetensors` -> `diffusion_pytorch_model.fp16.safetensors`
+ # with variant being `"fp16"`.
+ model_variants = _identify_model_variants(folder=cached_folder, variant=variant, config=config_dict)
# 3. Load the pipeline class, if using custom module then load it from the hub
# if we load from explicit class, let's use it
- custom_class_name = None
- if os.path.isfile(os.path.join(cached_folder, f"{custom_pipeline}.py")):
- custom_pipeline = os.path.join(cached_folder, f"{custom_pipeline}.py")
- elif isinstance(config_dict["_class_name"], (list, tuple)) and os.path.isfile(
- os.path.join(cached_folder, f"{config_dict['_class_name'][0]}.py")
- ):
- custom_pipeline = os.path.join(cached_folder, f"{config_dict['_class_name'][0]}.py")
- custom_class_name = config_dict["_class_name"][1]
-
+ custom_pipeline, custom_class_name = _resolve_custom_pipeline_and_cls(
+ folder=cached_folder, config=config_dict, custom_pipeline=custom_pipeline
+ )
pipeline_class = _get_pipeline_class(
cls,
- config_dict,
+ config=config_dict,
load_connected_pipeline=load_connected_pipeline,
custom_pipeline=custom_pipeline,
class_name=custom_class_name,
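Annotation: `_identify_model_variants` keeps the behaviour of the inlined code it replaces, i.e. scan each component subfolder named in the pipeline config for a checkpoint whose first extension segment starts with the requested variant. A minimal sketch of that matching idea (illustrative names, not the actual helper):

    import os

    def identify_model_variants_sketch(folder: str, variant: str, config: dict) -> dict:
        # For every component subfolder named in the pipeline config, record the
        # variant if a file like "diffusion_pytorch_model.fp16.safetensors" exists.
        model_variants = {}
        if variant is None:
            return model_variants
        for name in os.listdir(folder):
            path = os.path.join(folder, name)
            if os.path.isdir(path) and name in config:
                has_variant = any(
                    "." in f and f.split(".")[1].startswith(variant) for f in os.listdir(path)
                )
                if has_variant:
                    model_variants[name] = variant
        return model_variants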
@@ -760,23 +753,13 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
raise NotImplementedError("`device_map` is not yet supported for connected pipelines.")
# DEPRECATED: To be removed in 1.0.0
- if pipeline_class.__name__ == "StableDiffusionInpaintPipeline" and version.parse(
- version.parse(config_dict["_diffusers_version"]).base_version
- ) <= version.parse("0.5.1"):
- from diffusers import StableDiffusionInpaintPipeline, StableDiffusionInpaintPipelineLegacy
-
- pipeline_class = StableDiffusionInpaintPipelineLegacy
-
- deprecation_message = (
- "You are using a legacy checkpoint for inpainting with Stable Diffusion, therefore we are loading the"
- f" {StableDiffusionInpaintPipelineLegacy} class instead of {StableDiffusionInpaintPipeline}. For"
- " better inpainting results, we strongly suggest using Stable Diffusion's official inpainting"
- " checkpoint: https://huggingface.co/runwayml/stable-diffusion-inpainting instead or adapting your"
- f" checkpoint {pretrained_model_name_or_path} to the format of"
- " https://huggingface.co/runwayml/stable-diffusion-inpainting. Note that we do not actively maintain"
- " the {StableDiffusionInpaintPipelineLegacy} class and will likely remove it in version 1.0.0."
- )
- deprecate("StableDiffusionInpaintPipelineLegacy", "1.0.0", deprecation_message, standard_warn=False)
+ # we are deprecating the `StableDiffusionInpaintPipelineLegacy` pipeline, which gets loaded
+ # when a user requests a `StableDiffusionInpaintPipeline` and the checkpoint's `diffusers` version is <= 0.5.1.
+ _maybe_raise_warning_for_inpainting(
+ pipeline_class=pipeline_class,
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
+ config=config_dict,
+ )
# 4. Define expected modules given pipeline signature
# and define non-None initialized modules (=`init_kwargs`)
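Annotation: `_maybe_raise_warning_for_inpainting` preserves the old version gate, which compares the checkpoint's recorded `_diffusers_version` against 0.5.1. A rough sketch of that check (simplified, not the helper itself):

    from packaging import version

    def is_legacy_inpainting_checkpoint(config: dict) -> bool:
        # Checkpoints written with diffusers <= 0.5.1 used the legacy inpainting format.
        recorded = version.parse(config["_diffusers_version"]).base_version
        return version.parse(recorded) <= version.parse("0.5.1")

    print(is_legacy_inpainting_checkpoint({"_diffusers_version": "0.5.0"}))   # True
    print(is_legacy_inpainting_checkpoint({"_diffusers_version": "0.20.0"}))  # False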
@@ -787,7 +770,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
expected_modules, optional_kwargs = cls._get_signature_keys(pipeline_class)
passed_class_obj = {k: kwargs.pop(k) for k in expected_modules if k in kwargs}
passed_pipe_kwargs = {k: kwargs.pop(k) for k in optional_kwargs if k in kwargs}
-
init_dict, unused_kwargs, _ = pipeline_class.extract_init_dict(config_dict, **kwargs)
# define init kwargs and make sure that optional component modules are filtered out
@@ -847,6 +829,7 @@ def load_module(name, value):
# 7. Load each module in the pipeline
current_device_map = None
for name, (library_name, class_name) in logging.tqdm(init_dict.items(), desc="Loading pipeline components..."):
+ # 7.1 device_map shenanigans
if final_device_map is not None and len(final_device_map) > 0:
component_device = final_device_map.get(name, None)
if component_device is not None:
@@ -854,15 +837,15 @@ def load_module(name, value):
else:
current_device_map = None
- # 7.1 - now that JAX/Flax is an official framework of the library, we might load from Flax names
+ # 7.2 - now that JAX/Flax is an official framework of the library, we might load from Flax names
class_name = class_name[4:] if class_name.startswith("Flax") else class_name
- # 7.2 Define all importable classes
+ # 7.3 Define all importable classes
is_pipeline_module = hasattr(pipelines, library_name)
importable_classes = ALL_IMPORTABLE_CLASSES
loaded_sub_model = None
- # 7.3 Use passed sub model or load class_name from library_name
+ # 7.4 Use passed sub model or load class_name from library_name
if name in passed_class_obj:
# if the model is in a pipeline module, then we load it from the pipeline
# check that passed_class_obj has correct parent class
@@ -900,56 +883,17 @@ def load_module(name, value):
init_kwargs[name] = loaded_sub_model # UNet(...), # DiffusionSchedule(...)
+ # 8. Handle connected pipelines.
if pipeline_class._load_connected_pipes and os.path.isfile(os.path.join(cached_folder, "README.md")):
- modelcard = ModelCard.load(os.path.join(cached_folder, "README.md"))
- connected_pipes = {prefix: getattr(modelcard.data, prefix, [None])[0] for prefix in CONNECTED_PIPES_KEYS}
- load_kwargs = {
- "cache_dir": cache_dir,
- "force_download": force_download,
- "proxies": proxies,
- "local_files_only": local_files_only,
- "token": token,
- "revision": revision,
- "torch_dtype": torch_dtype,
- "custom_pipeline": custom_pipeline,
- "custom_revision": custom_revision,
- "provider": provider,
- "sess_options": sess_options,
- "device_map": device_map,
- "max_memory": max_memory,
- "offload_folder": offload_folder,
- "offload_state_dict": offload_state_dict,
- "low_cpu_mem_usage": low_cpu_mem_usage,
- "variant": variant,
- "use_safetensors": use_safetensors,
- }
-
- def get_connected_passed_kwargs(prefix):
- connected_passed_class_obj = {
- k.replace(f"{prefix}_", ""): w for k, w in passed_class_obj.items() if k.split("_")[0] == prefix
- }
- connected_passed_pipe_kwargs = {
- k.replace(f"{prefix}_", ""): w for k, w in passed_pipe_kwargs.items() if k.split("_")[0] == prefix
- }
-
- connected_passed_kwargs = {**connected_passed_class_obj, **connected_passed_pipe_kwargs}
- return connected_passed_kwargs
-
- connected_pipes = {
- prefix: DiffusionPipeline.from_pretrained(
- repo_id, **load_kwargs.copy(), **get_connected_passed_kwargs(prefix)
- )
- for prefix, repo_id in connected_pipes.items()
- if repo_id is not None
- }
-
- for prefix, connected_pipe in connected_pipes.items():
- # add connected pipes to `init_kwargs` with _, e.g. "prior_text_encoder"
- init_kwargs.update(
- {"_".join([prefix, name]): component for name, component in connected_pipe.components.items()}
- )
+ init_kwargs = _update_init_kwargs_with_connected_pipeline(
+ init_kwargs=init_kwargs,
+ passed_pipe_kwargs=passed_pipe_kwargs,
+ passed_class_objs=passed_class_obj,
+ folder=cached_folder,
+ **kwargs_copied,
+ )
- # 8. Potentially add passed objects if expected
+ # 9. Potentially add passed objects if expected
missing_modules = set(expected_modules) - set(init_kwargs.keys())
passed_modules = list(passed_class_obj.keys())
optional_modules = pipeline_class._optional_components
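Annotation: from the user side nothing changes here; connected pipelines are still resolved from the repo's model card and prefixed component overrides are still honoured. A hedged usage sketch (the repo id is illustrative):

    import torch
    from diffusers import DiffusionPipeline

    # Hypothetical repo whose model card declares connected pipelines.
    pipe = DiffusionPipeline.from_pretrained(
        "some-org/some-cascaded-pipeline",
        load_connected_pipeline=True,
        torch_dtype=torch.float16,
        # "prior_"-prefixed overrides are routed to the connected "prior" pipeline,
        # e.g. prior_text_encoder=my_text_encoder.
    )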
@@ -1417,7 +1361,7 @@ def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]:
use_safetensors
and not allow_pickle
and not is_safetensors_compatible(
- model_filenames, variant=variant, passed_components=passed_components
+ model_filenames, passed_components=passed_components, folder_names=model_folder_names
)
):
raise EnvironmentError(
@@ -1426,7 +1370,7 @@ def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]:
if from_flax:
ignore_patterns = ["*.bin", "*.safetensors", "*.onnx", "*.pb"]
elif use_safetensors and is_safetensors_compatible(
- model_filenames, variant=variant, passed_components=passed_components
+ model_filenames, passed_components=passed_components, folder_names=model_folder_names
):
ignore_patterns = ["*.bin", "*.msgpack"]
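Annotation: the compatibility check now receives the resolved component folder names instead of a variant. The underlying idea, sketched loosely (this is not the real `is_safetensors_compatible` implementation):

    import os

    def safetensors_compatible_sketch(filenames):
        # Illustrative idea only: every torch .bin weight should have a
        # .safetensors counterpart with the same stem.
        bins = {os.path.splitext(f)[0] for f in filenames if f.endswith(".bin")}
        sts = {os.path.splitext(f)[0] for f in filenames if f.endswith(".safetensors")}
        return bins <= sts

    print(safetensors_compatible_sketch(
        ["unet/diffusion_pytorch_model.bin", "unet/diffusion_pytorch_model.safetensors"]
    ))  # True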
diff --git a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py
index fd644a5dfdba..cd6585ea20f2 100644
--- a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py
+++ b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py
@@ -281,6 +281,16 @@ def do_classifier_free_guidance(self):
def num_timesteps(self):
return self._num_timesteps
+ def get_timestep_ratio_conditioning(self, t, alphas_cumprod):
+ s = torch.tensor([0.008])
+ clamp_range = [0, 1]
+ min_var = torch.cos(s / (1 + s) * torch.pi * 0.5) ** 2
+ var = alphas_cumprod[t]
+ var = var.clamp(*clamp_range)
+ s, min_var = s.to(var.device), min_var.to(var.device)
+ ratio = (((var * min_var) ** 0.5).acos() / (torch.pi * 0.5)) * (1 + s) - s
+ return ratio
+
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
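Annotation: `get_timestep_ratio_conditioning` inverts a shifted-cosine noise schedule (offset s = 0.008, matching the corrected value in the prior pipeline below) to recover the continuous timestep ratio Stable Cascade is conditioned on. A small self-contained numeric check of that inversion, assuming a cosine-style alphas_cumprod:

    import torch

    s = torch.tensor([0.008])
    min_var = torch.cos(s / (1 + s) * torch.pi * 0.5) ** 2

    # Illustrative alphas_cumprod from a cosine schedule with 1000 discrete steps.
    T = 1000
    steps = torch.arange(T + 1) / T
    alphas_cumprod = (torch.cos((steps + s) / (1 + s) * torch.pi * 0.5) ** 2 / min_var).clamp(0, 1)

    def ratio(t_idx):
        var = alphas_cumprod[t_idx].clamp(0, 1)
        return (((var * min_var) ** 0.5).acos() / (torch.pi * 0.5)) * (1 + s) - s

    print(ratio(0))    # ~0.0 -> alphas_cumprod ~ 1, i.e. barely noised
    print(ratio(999))  # ~1.0 -> alphas_cumprod ~ 0, i.e. almost pure noise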
@@ -434,10 +444,30 @@ def __call__(
batch_size, image_embeddings, num_images_per_prompt, dtype, device, generator, latents, self.scheduler
)
+ if isinstance(self.scheduler, DDPMWuerstchenScheduler):
+ timesteps = timesteps[:-1]
+ else:
+ if hasattr(self.scheduler.config, "clip_sample") and self.scheduler.config.clip_sample:
+ self.scheduler.config.clip_sample = False # disable sample clipping
+ logger.warning(" set `clip_sample` to be False")
+
# 6. Run denoising loop
- self._num_timesteps = len(timesteps[:-1])
- for i, t in enumerate(self.progress_bar(timesteps[:-1])):
- timestep_ratio = t.expand(latents.size(0)).to(dtype)
+ if hasattr(self.scheduler, "betas"):
+ alphas = 1.0 - self.scheduler.betas
+ alphas_cumprod = torch.cumprod(alphas, dim=0)
+ else:
+ alphas_cumprod = []
+
+ self._num_timesteps = len(timesteps)
+ for i, t in enumerate(self.progress_bar(timesteps)):
+ if not isinstance(self.scheduler, DDPMWuerstchenScheduler):
+ if len(alphas_cumprod) > 0:
+ timestep_ratio = self.get_timestep_ratio_conditioning(t.long().cpu(), alphas_cumprod)
+ timestep_ratio = timestep_ratio.expand(latents.size(0)).to(dtype).to(device)
+ else:
+ timestep_ratio = t.float().div(self.scheduler.timesteps[-1]).expand(latents.size(0)).to(dtype)
+ else:
+ timestep_ratio = t.expand(latents.size(0)).to(dtype)
# 7. Denoise latents
predicted_latents = self.decoder(
@@ -454,6 +484,8 @@ def __call__(
predicted_latents = torch.lerp(predicted_latents_uncond, predicted_latents_text, self.guidance_scale)
# 9. Renoise latents to next timestep
+ if not isinstance(self.scheduler, DDPMWuerstchenScheduler):
+ timestep_ratio = t
latents = self.scheduler.step(
model_output=predicted_latents,
timestep=timestep_ratio,
diff --git a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py
index 1e9e33a42030..8c747c9cf8c6 100644
--- a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py
+++ b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py
@@ -353,7 +353,7 @@ def num_timesteps(self):
return self._num_timesteps
def get_timestep_ratio_conditioning(self, t, alphas_cumprod):
- s = torch.tensor([0.003])
+ s = torch.tensor([0.008])
clamp_range = [0, 1]
min_var = torch.cos(s / (1 + s) * torch.pi * 0.5) ** 2
var = alphas_cumprod[t]
@@ -557,7 +557,7 @@ def __call__(
if isinstance(self.scheduler, DDPMWuerstchenScheduler):
timesteps = timesteps[:-1]
else:
- if self.scheduler.config.clip_sample:
+ if hasattr(self.scheduler.config, "clip_sample") and self.scheduler.config.clip_sample:
self.scheduler.config.clip_sample = False # disample sample clipping
logger.warning(" set `clip_sample` to be False")
# 6. Run denoising loop
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py
index 4d033133e5ec..ffe02ae679e5 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py
@@ -33,6 +33,20 @@
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+ encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+ if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+ return encoder_output.latent_dist.sample(generator)
+ elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+ return encoder_output.latent_dist.mode()
+ elif hasattr(encoder_output, "latents"):
+ return encoder_output.latents
+ else:
+ raise AttributeError("Could not access latents of provided encoder_output")
+
+
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.preprocess
def preprocess(image):
warnings.warn(
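Annotation: `retrieve_latents` lets the upscaler accept either a distribution-style VAE output (via `latent_dist`) or an object carrying pre-computed `latents`. A self-contained illustration with stand-in objects (not the real VAE classes):

    import torch
    from types import SimpleNamespace

    def retrieve_latents_sketch(encoder_output, generator=None, sample_mode="sample"):
        # Same dispatch as the helper added above.
        if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
            return encoder_output.latent_dist.sample(generator)
        if hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
            return encoder_output.latent_dist.mode()
        if hasattr(encoder_output, "latents"):
            return encoder_output.latents
        raise AttributeError("Could not access latents of provided encoder_output")

    class FakeDist:  # stand-in for a diagonal Gaussian latent distribution
        def __init__(self, mean):
            self.mean = mean
        def sample(self, generator=None):
            return self.mean + torch.randn_like(self.mean)
        def mode(self):
            return self.mean

    enc_out = SimpleNamespace(latent_dist=FakeDist(torch.zeros(1, 4, 8, 8)))
    print(retrieve_latents_sketch(enc_out).shape)                        # torch.Size([1, 4, 8, 8])
    print(retrieve_latents_sketch(enc_out, sample_mode="argmax").shape)  # torch.Size([1, 4, 8, 8])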
@@ -105,7 +119,54 @@ def __init__(
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, resample="bicubic")
- def _encode_prompt(self, prompt, device, do_classifier_free_guidance, negative_prompt):
+ def _encode_prompt(
+ self,
+ prompt,
+ device,
+ do_classifier_free_guidance,
+ negative_prompt=None,
+ prompt_embeds: Optional[torch.Tensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
+ pooled_prompt_embeds: Optional[torch.Tensor] = None,
+ negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+ **kwargs,
+ ):
+ deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+ deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+ (
+ prompt_embeds,
+ negative_prompt_embeds,
+ pooled_prompt_embeds,
+ negative_pooled_prompt_embeds,
+ ) = self.encode_prompt(
+ prompt=prompt,
+ device=device,
+ do_classifier_free_guidance=do_classifier_free_guidance,
+ negative_prompt=negative_prompt,
+ prompt_embeds=prompt_embeds,
+ negative_prompt_embeds=negative_prompt_embeds,
+ pooled_prompt_embeds=pooled_prompt_embeds,
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+ **kwargs,
+ )
+
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+ pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds])
+
+ return prompt_embeds, pooled_prompt_embeds
+
+ def encode_prompt(
+ self,
+ prompt,
+ device,
+ do_classifier_free_guidance,
+ negative_prompt=None,
+ prompt_embeds: Optional[torch.Tensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
+ pooled_prompt_embeds: Optional[torch.Tensor] = None,
+ negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+ ):
r"""
Encodes the prompt into text encoder hidden states.
@@ -119,81 +180,100 @@ def _encode_prompt(self, prompt, device, do_classifier_free_guidance, negative_p
negative_prompt (`str` or `List[str]`):
The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
if `guidance_scale` is less than `1`).
+ prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+ provided, text embeddings will be generated from `prompt` input argument.
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+ argument.
+ pooled_prompt_embeds (`torch.Tensor`, *optional*):
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
+ negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+ input argument.
"""
- batch_size = len(prompt) if isinstance(prompt, list) else 1
-
- text_inputs = self.tokenizer(
- prompt,
- padding="max_length",
- max_length=self.tokenizer.model_max_length,
- truncation=True,
- return_length=True,
- return_tensors="pt",
- )
- text_input_ids = text_inputs.input_ids
-
- untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
-
- if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
- removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
- logger.warning(
- "The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {self.tokenizer.model_max_length} tokens: {removed_text}"
- )
-
- text_encoder_out = self.text_encoder(
- text_input_ids.to(device),
- output_hidden_states=True,
- )
- text_embeddings = text_encoder_out.hidden_states[-1]
- text_pooler_out = text_encoder_out.pooler_output
-
- # get unconditional embeddings for classifier free guidance
- if do_classifier_free_guidance:
- uncond_tokens: List[str]
- if negative_prompt is None:
- uncond_tokens = [""] * batch_size
- elif type(prompt) is not type(negative_prompt):
- raise TypeError(
- f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}."
- )
- elif isinstance(negative_prompt, str):
- uncond_tokens = [negative_prompt]
- elif batch_size != len(negative_prompt):
- raise ValueError(
- f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
- f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`."
- )
- else:
- uncond_tokens = negative_prompt
+ if prompt is not None and isinstance(prompt, str):
+ batch_size = 1
+ elif prompt is not None and isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ batch_size = prompt_embeds.shape[0]
- max_length = text_input_ids.shape[-1]
- uncond_input = self.tokenizer(
- uncond_tokens,
+ if prompt_embeds is None or pooled_prompt_embeds is None:
+ text_inputs = self.tokenizer(
+ prompt,
padding="max_length",
- max_length=max_length,
+ max_length=self.tokenizer.model_max_length,
truncation=True,
return_length=True,
return_tensors="pt",
)
+ text_input_ids = text_inputs.input_ids
- uncond_encoder_out = self.text_encoder(
- uncond_input.input_ids.to(device),
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+ text_input_ids, untruncated_ids
+ ):
+ removed_text = self.tokenizer.batch_decode(
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+ )
+ logger.warning(
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
+
+ text_encoder_out = self.text_encoder(
+ text_input_ids.to(device),
output_hidden_states=True,
)
+ prompt_embeds = text_encoder_out.hidden_states[-1]
+ pooled_prompt_embeds = text_encoder_out.pooler_output
- uncond_embeddings = uncond_encoder_out.hidden_states[-1]
- uncond_pooler_out = uncond_encoder_out.pooler_output
+ # get unconditional embeddings for classifier free guidance
+ if do_classifier_free_guidance:
+ if negative_prompt_embeds is None or negative_pooled_prompt_embeds is None:
+ uncond_tokens: List[str]
+ if negative_prompt is None:
+ uncond_tokens = [""] * batch_size
+ elif type(prompt) is not type(negative_prompt):
+ raise TypeError(
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+ f" {type(prompt)}."
+ )
+ elif isinstance(negative_prompt, str):
+ uncond_tokens = [negative_prompt]
+ elif batch_size != len(negative_prompt):
+ raise ValueError(
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+ " the batch size of `prompt`."
+ )
+ else:
+ uncond_tokens = negative_prompt
+
+ max_length = text_input_ids.shape[-1]
+ uncond_input = self.tokenizer(
+ uncond_tokens,
+ padding="max_length",
+ max_length=max_length,
+ truncation=True,
+ return_length=True,
+ return_tensors="pt",
+ )
+
+ uncond_encoder_out = self.text_encoder(
+ uncond_input.input_ids.to(device),
+ output_hidden_states=True,
+ )
- # For classifier free guidance, we need to do two forward passes.
- # Here we concatenate the unconditional and text embeddings into a single batch
- # to avoid doing two forward passes
- text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
- text_pooler_out = torch.cat([uncond_pooler_out, text_pooler_out])
+ negative_prompt_embeds = uncond_encoder_out.hidden_states[-1]
+ negative_pooled_prompt_embeds = uncond_encoder_out.pooler_output
- return text_embeddings, text_pooler_out
+ return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
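Annotation: the refactor brings this pipeline in line with the other Stable Diffusion pipelines: `encode_prompt` returns a 4-tuple and leaves the classifier-free-guidance concatenation to the caller, which is what `__call__` now does further down. A hedged usage sketch (requires downloading a real checkpoint):

    import torch
    from diffusers import StableDiffusionLatentUpscalePipeline

    pipe = StableDiffusionLatentUpscalePipeline.from_pretrained(
        "stabilityai/sd-x2-latent-upscaler", torch_dtype=torch.float16
    ).to("cuda")

    (
        prompt_embeds,
        negative_prompt_embeds,
        pooled_prompt_embeds,
        negative_pooled_prompt_embeds,
    ) = pipe.encode_prompt("a photo of a cat", device="cuda", do_classifier_free_guidance=True)

    # With guidance enabled, the caller concatenates uncond + cond, as in __call__:
    prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
    pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds])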
@@ -207,12 +287,56 @@ def decode_latents(self, latents):
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
return image
- def check_inputs(self, prompt, image, callback_steps):
- if not isinstance(prompt, str) and not isinstance(prompt, list):
+ def check_inputs(
+ self,
+ prompt,
+ image,
+ callback_steps,
+ negative_prompt=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ pooled_prompt_embeds=None,
+ negative_pooled_prompt_embeds=None,
+ ):
+ if prompt is not None and prompt_embeds is not None:
+ raise ValueError(
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+ " only forward one of the two."
+ )
+ elif prompt is None and prompt_embeds is None:
+ raise ValueError(
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+ )
+ elif prompt is not None and not isinstance(prompt, str) and not isinstance(prompt, list):
raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+ if negative_prompt is not None and negative_prompt_embeds is not None:
+ raise ValueError(
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+ )
+
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
+ raise ValueError(
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+ f" {negative_prompt_embeds.shape}."
+ )
+
+ if prompt_embeds is not None and pooled_prompt_embeds is None:
+ raise ValueError(
+ "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+ )
+
+ if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+ raise ValueError(
+ "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+ )
+
if (
not isinstance(image, torch.Tensor)
+ and not isinstance(image, np.ndarray)
and not isinstance(image, PIL.Image.Image)
and not isinstance(image, list)
):
@@ -222,10 +346,14 @@ def check_inputs(self, prompt, image, callback_steps):
# verify batch size of prompt and image are same if image is a list or tensor
if isinstance(image, (list, torch.Tensor)):
- if isinstance(prompt, str):
- batch_size = 1
+ if prompt is not None:
+ if isinstance(prompt, str):
+ batch_size = 1
+ else:
+ batch_size = len(prompt)
else:
- batch_size = len(prompt)
+ batch_size = prompt_embeds.shape[0]
+
if isinstance(image, list):
image_batch_size = len(image)
else:
@@ -261,13 +389,17 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype
@torch.no_grad()
def __call__(
self,
- prompt: Union[str, List[str]],
+ prompt: Union[str, List[str]] = None,
image: PipelineImageInput = None,
num_inference_steps: int = 75,
guidance_scale: float = 9.0,
negative_prompt: Optional[Union[str, List[str]]] = None,
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
latents: Optional[torch.Tensor] = None,
+ prompt_embeds: Optional[torch.Tensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
+ pooled_prompt_embeds: Optional[torch.Tensor] = None,
+ negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
output_type: Optional[str] = "pil",
return_dict: bool = True,
callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
@@ -359,10 +491,22 @@ def __call__(
"""
# 1. Check inputs
- self.check_inputs(prompt, image, callback_steps)
+ self.check_inputs(
+ prompt,
+ image,
+ callback_steps,
+ negative_prompt,
+ prompt_embeds,
+ negative_prompt_embeds,
+ pooled_prompt_embeds,
+ negative_pooled_prompt_embeds,
+ )
# 2. Define call parameters
- batch_size = 1 if isinstance(prompt, str) else len(prompt)
+ if prompt is not None:
+ batch_size = 1 if isinstance(prompt, str) else len(prompt)
+ else:
+ batch_size = prompt_embeds.shape[0]
device = self._execution_device
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
@@ -373,16 +517,32 @@ def __call__(
prompt = [""] * batch_size
# 3. Encode input prompt
- text_embeddings, text_pooler_out = self._encode_prompt(
- prompt, device, do_classifier_free_guidance, negative_prompt
+ (
+ prompt_embeds,
+ negative_prompt_embeds,
+ pooled_prompt_embeds,
+ negative_pooled_prompt_embeds,
+ ) = self.encode_prompt(
+ prompt,
+ device,
+ do_classifier_free_guidance,
+ negative_prompt,
+ prompt_embeds,
+ negative_prompt_embeds,
+ pooled_prompt_embeds,
+ negative_pooled_prompt_embeds,
)
+ if do_classifier_free_guidance:
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+ pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds])
+
# 4. Preprocess image
image = self.image_processor.preprocess(image)
- image = image.to(dtype=text_embeddings.dtype, device=device)
+ image = image.to(dtype=prompt_embeds.dtype, device=device)
if image.shape[1] == 3:
# encode image if not in latent-space yet
- image = self.vae.encode(image).latent_dist.sample() * self.vae.config.scaling_factor
+ image = retrieve_latents(self.vae.encode(image), generator=generator) * self.vae.config.scaling_factor
# 5. set timesteps
self.scheduler.set_timesteps(num_inference_steps, device=device)
@@ -400,17 +560,17 @@ def __call__(
inv_noise_level = (noise_level**2 + 1) ** (-0.5)
image_cond = F.interpolate(image, scale_factor=2, mode="nearest") * inv_noise_level[:, None, None, None]
- image_cond = image_cond.to(text_embeddings.dtype)
+ image_cond = image_cond.to(prompt_embeds.dtype)
noise_level_embed = torch.cat(
[
- torch.ones(text_pooler_out.shape[0], 64, dtype=text_pooler_out.dtype, device=device),
- torch.zeros(text_pooler_out.shape[0], 64, dtype=text_pooler_out.dtype, device=device),
+ torch.ones(pooled_prompt_embeds.shape[0], 64, dtype=pooled_prompt_embeds.dtype, device=device),
+ torch.zeros(pooled_prompt_embeds.shape[0], 64, dtype=pooled_prompt_embeds.dtype, device=device),
],
dim=1,
)
- timestep_condition = torch.cat([noise_level_embed, text_pooler_out], dim=1)
+ timestep_condition = torch.cat([noise_level_embed, pooled_prompt_embeds], dim=1)
# 6. Prepare latent variables
height, width = image.shape[2:]
@@ -420,7 +580,7 @@ def __call__(
num_channels_latents,
height * 2, # 2x upscale
width * 2,
- text_embeddings.dtype,
+ prompt_embeds.dtype,
device,
generator,
latents,
@@ -454,7 +614,7 @@ def __call__(
noise_pred = self.unet(
scaled_model_input,
timestep,
- encoder_hidden_states=text_embeddings,
+ encoder_hidden_states=prompt_embeds,
timestep_cond=timestep_condition,
).sample
diff --git a/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py b/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py
index 1e396cb2329f..122701ff923f 100755
--- a/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py
@@ -602,9 +602,9 @@ def __call__(
sigma_min: float = self.k_diffusion_model.sigmas[0].item()
sigma_max: float = self.k_diffusion_model.sigmas[-1].item()
sigmas = get_sigmas_karras(n=num_inference_steps, sigma_min=sigma_min, sigma_max=sigma_max)
- sigmas = sigmas.to(device)
else:
sigmas = self.scheduler.sigmas
+ sigmas = sigmas.to(device)
sigmas = sigmas.to(prompt_embeds.dtype)
# 6. Prepare latent variables
diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
index e3d18121a151..477beed49f52 100644
--- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py
+++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
@@ -1637,6 +1637,21 @@ def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch", "transformers"])
+class StableDiffusionXLControlNetPAGImg2ImgPipeline(metaclass=DummyObject):
+ _backends = ["torch", "transformers"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch", "transformers"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "transformers"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "transformers"])
+
+
class StableDiffusionXLControlNetPAGPipeline(metaclass=DummyObject):
_backends = ["torch", "transformers"]
diff --git a/src/diffusers/utils/loading_utils.py b/src/diffusers/utils/loading_utils.py
index ba6f7ccace3e..b36664cb81ff 100644
--- a/src/diffusers/utils/loading_utils.py
+++ b/src/diffusers/utils/loading_utils.py
@@ -1,6 +1,7 @@
import os
import tempfile
from typing import Callable, List, Optional, Union
+from urllib.parse import unquote, urlparse
import PIL.Image
import PIL.ImageOps
@@ -80,12 +81,22 @@ def load_video(
)
if is_url:
- video_data = requests.get(video, stream=True).raw
- suffix = os.path.splitext(video)[1] or ".mp4"
+ response = requests.get(video, stream=True)
+ if response.status_code != 200:
+ raise ValueError(f"Failed to download video. Status code: {response.status_code}")
+
+ parsed_url = urlparse(video)
+ file_name = os.path.basename(unquote(parsed_url.path))
+
+ suffix = os.path.splitext(file_name)[1] or ".mp4"
video_path = tempfile.NamedTemporaryFile(suffix=suffix, delete=False).name
+
was_tempfile_created = True
+
+ video_data = response.iter_content(chunk_size=8192)
with open(video_path, "wb") as f:
- f.write(video_data.read())
+ for chunk in video_data:
+ f.write(chunk)
video = video_path
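Annotation: the reason the suffix is now derived from the decoded URL path is that splitting the raw URL lets query strings leak into the file extension. A quick illustration:

    import os
    from urllib.parse import unquote, urlparse

    url = "https://example.com/videos/sample%20clip.mp4?download=true"  # illustrative URL

    # Old approach: the query string leaks into the suffix.
    print(os.path.splitext(url)[1])                  # '.mp4?download=true'

    # New approach: parse the URL, decode the path, then split the extension.
    file_name = os.path.basename(unquote(urlparse(url).path))
    print(os.path.splitext(file_name)[1] or ".mp4")  # '.mp4'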
diff --git a/tests/lora/test_lora_layers_flux.py b/tests/lora/test_lora_layers_flux.py
index c0f0684ac4de..a82d37665fc5 100644
--- a/tests/lora/test_lora_layers_flux.py
+++ b/tests/lora/test_lora_layers_flux.py
@@ -12,19 +12,26 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+import os
import sys
+import tempfile
import unittest
+import numpy as np
+import safetensors.torch
import torch
from transformers import AutoTokenizer, CLIPTextModel, CLIPTokenizer, T5EncoderModel
from diffusers import FlowMatchEulerDiscreteScheduler, FluxPipeline, FluxTransformer2DModel
-from diffusers.utils.testing_utils import floats_tensor, require_peft_backend
+from diffusers.utils.testing_utils import floats_tensor, is_peft_available, require_peft_backend, torch_device
+if is_peft_available():
+ from peft.utils import get_peft_model_state_dict
+
sys.path.append(".")
-from utils import PeftLoraLoaderMixinTests # noqa: E402
+from utils import PeftLoraLoaderMixinTests, check_if_lora_correctly_set # noqa: E402
@require_peft_backend
@@ -90,3 +97,51 @@ def get_dummy_inputs(self, with_generator=True):
pipeline_inputs.update({"generator": generator})
return noise, input_ids, pipeline_inputs
+
+ def test_with_alpha_in_state_dict(self):
+ components, _, denoiser_lora_config = self.get_dummy_components(FlowMatchEulerDiscreteScheduler)
+ pipe = self.pipeline_class(**components)
+ pipe = pipe.to(torch_device)
+ pipe.set_progress_bar_config(disable=None)
+ _, _, inputs = self.get_dummy_inputs(with_generator=False)
+
+ output_no_lora = pipe(**inputs, generator=torch.manual_seed(0)).images
+ self.assertTrue(output_no_lora.shape == self.output_shape)
+
+ pipe.transformer.add_adapter(denoiser_lora_config)
+ self.assertTrue(check_if_lora_correctly_set(pipe.transformer), "Lora not correctly set in transformer")
+
+ images_lora = pipe(**inputs, generator=torch.manual_seed(0)).images
+
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ denoiser_state_dict = get_peft_model_state_dict(pipe.transformer)
+ self.pipeline_class.save_lora_weights(tmpdirname, transformer_lora_layers=denoiser_state_dict)
+
+ self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors")))
+ pipe.unload_lora_weights()
+ pipe.load_lora_weights(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors"))
+
+ # modify the state dict to have alpha values following
+ # https://huggingface.co/TheLastBen/Jon_Snow_Flux_LoRA/blob/main/jon_snow.safetensors
+ state_dict_with_alpha = safetensors.torch.load_file(
+ os.path.join(tmpdirname, "pytorch_lora_weights.safetensors")
+ )
+ alpha_dict = {}
+ for k, v in state_dict_with_alpha.items():
+ # only do this for the `transformer` and for the k projections -- should be enough to test.
+ if "transformer" in k and "to_k" in k and "lora_A" in k:
+ alpha_dict[f"{k}.alpha"] = float(torch.randint(10, 100, size=()))
+ state_dict_with_alpha.update(alpha_dict)
+
+ images_lora_from_pretrained = pipe(**inputs, generator=torch.manual_seed(0)).images
+ self.assertTrue(check_if_lora_correctly_set(pipe.transformer), "Lora not correctly set in denoiser")
+
+ pipe.unload_lora_weights()
+ pipe.load_lora_weights(state_dict_with_alpha)
+ images_lora_with_alpha = pipe(**inputs, generator=torch.manual_seed(0)).images
+
+ self.assertTrue(
+ np.allclose(images_lora, images_lora_from_pretrained, atol=1e-3, rtol=1e-3),
+ "Loading from saved checkpoints should give same results.",
+ )
+ self.assertFalse(np.allclose(images_lora_with_alpha, images_lora, atol=1e-3, rtol=1e-3))
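Annotation: the injected `.alpha` entries matter because, in PEFT-style LoRA, the per-module update is scaled by `alpha / rank`; changing alpha changes the effective LoRA strength, which is why the test expects different images once alphas are added. A tiny numeric sketch of that scaling (shapes are illustrative):

    import torch

    rank, d_in, d_out = 4, 16, 16
    lora_A = torch.randn(rank, d_in)    # down-projection
    lora_B = torch.randn(d_out, rank)   # up-projection

    def lora_delta(alpha):
        # Effective weight update applied on top of the frozen base weight.
        return (lora_B @ lora_A) * (alpha / rank)

    print(lora_delta(alpha=4).norm())    # scaling = 1.0
    print(lora_delta(alpha=64).norm())   # 16x stronger update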
diff --git a/tests/lora/test_lora_layers_sd3.py b/tests/lora/test_lora_layers_sd3.py
index 31c62f27a75a..063ff4c8b05d 100644
--- a/tests/lora/test_lora_layers_sd3.py
+++ b/tests/lora/test_lora_layers_sd3.py
@@ -32,7 +32,7 @@
@require_peft_backend
class SD3LoRATests(unittest.TestCase, PeftLoraLoaderMixinTests):
pipeline_class = StableDiffusion3Pipeline
- scheduler_cls = FlowMatchEulerDiscreteScheduler()
+ scheduler_cls = FlowMatchEulerDiscreteScheduler
scheduler_kwargs = {}
uses_flow_matching = True
transformer_kwargs = {
@@ -80,8 +80,7 @@ def test_sd3_lora(self):
Related PR: https://github.com/huggingface/diffusers/pull/8584
"""
components = self.get_dummy_components()
-
- pipe = self.pipeline_class(**components)
+ pipe = self.pipeline_class(**components[0])
pipe = pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
diff --git a/tests/lora/test_lora_layers_sdxl.py b/tests/lora/test_lora_layers_sdxl.py
index f00f7b193abf..4ec7ef897485 100644
--- a/tests/lora/test_lora_layers_sdxl.py
+++ b/tests/lora/test_lora_layers_sdxl.py
@@ -124,71 +124,6 @@ def tearDown(self):
gc.collect()
torch.cuda.empty_cache()
- def test_sdxl_0_9_lora_one(self):
- generator = torch.Generator().manual_seed(0)
-
- pipe = StableDiffusionXLPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-0.9")
- lora_model_id = "hf-internal-testing/sdxl-0.9-daiton-lora"
- lora_filename = "daiton-xl-lora-test.safetensors"
- pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
- pipe.enable_model_cpu_offload()
-
- images = pipe(
- "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
- ).images
-
- images = images[0, -3:, -3:, -1].flatten()
- expected = np.array([0.3838, 0.3482, 0.3588, 0.3162, 0.319, 0.3369, 0.338, 0.3366, 0.3213])
-
- max_diff = numpy_cosine_similarity_distance(expected, images)
- assert max_diff < 1e-3
- pipe.unload_lora_weights()
- release_memory(pipe)
-
- def test_sdxl_0_9_lora_two(self):
- generator = torch.Generator().manual_seed(0)
-
- pipe = StableDiffusionXLPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-0.9")
- lora_model_id = "hf-internal-testing/sdxl-0.9-costumes-lora"
- lora_filename = "saijo.safetensors"
- pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
- pipe.enable_model_cpu_offload()
-
- images = pipe(
- "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
- ).images
-
- images = images[0, -3:, -3:, -1].flatten()
- expected = np.array([0.3137, 0.3269, 0.3355, 0.255, 0.2577, 0.2563, 0.2679, 0.2758, 0.2626])
-
- max_diff = numpy_cosine_similarity_distance(expected, images)
- assert max_diff < 1e-3
-
- pipe.unload_lora_weights()
- release_memory(pipe)
-
- def test_sdxl_0_9_lora_three(self):
- generator = torch.Generator().manual_seed(0)
-
- pipe = StableDiffusionXLPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-0.9")
- lora_model_id = "hf-internal-testing/sdxl-0.9-kamepan-lora"
- lora_filename = "kame_sdxl_v2-000020-16rank.safetensors"
- pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
- pipe.enable_model_cpu_offload()
-
- images = pipe(
- "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
- ).images
-
- images = images[0, -3:, -3:, -1].flatten()
- expected = np.array([0.4015, 0.3761, 0.3616, 0.3745, 0.3462, 0.3337, 0.3564, 0.3649, 0.3468])
-
- max_diff = numpy_cosine_similarity_distance(expected, images)
- assert max_diff < 5e-3
-
- pipe.unload_lora_weights()
- release_memory(pipe)
-
def test_sdxl_1_0_lora(self):
generator = torch.Generator("cpu").manual_seed(0)
diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py
index 64722e2d9797..0ce01fb93f40 100644
--- a/tests/models/test_modeling_common.py
+++ b/tests/models/test_modeling_common.py
@@ -976,7 +976,6 @@ def test_sharded_checkpoints_device_map(self):
self.assertTrue(actual_num_shards == expected_num_shards)
new_model = self.model_class.from_pretrained(tmp_dir, device_map="auto")
- new_model = new_model.to(torch_device)
torch.manual_seed(0)
if "generator" in inputs_dict:
diff --git a/tests/models/transformers/test_models_transformer_aura_flow.py b/tests/models/transformers/test_models_transformer_aura_flow.py
index 57fac4ba769c..376d8b57da4d 100644
--- a/tests/models/transformers/test_models_transformer_aura_flow.py
+++ b/tests/models/transformers/test_models_transformer_aura_flow.py
@@ -26,9 +26,11 @@
enable_full_determinism()
-class SD3TransformerTests(ModelTesterMixin, unittest.TestCase):
+class AuraFlowTransformerTests(ModelTesterMixin, unittest.TestCase):
model_class = AuraFlowTransformer2DModel
main_input_name = "hidden_states"
+ # We override the items here because the transformer under consideration is small.
+ model_split_percents = [0.7, 0.6, 0.6]
@property
def dummy_input(self):
@@ -71,3 +73,7 @@ def prepare_init_args_and_inputs_for_common(self):
}
inputs_dict = self.dummy_input
return init_dict, inputs_dict
+
+ @unittest.skip("AuraFlowTransformer2DModel uses its own dedicated attention processor. This test does not apply")
+ def test_set_attn_processor_for_determinism(self):
+ pass
diff --git a/tests/models/transformers/test_models_transformer_flux.py b/tests/models/transformers/test_models_transformer_flux.py
index d1c85537b00b..538d158cbcb9 100644
--- a/tests/models/transformers/test_models_transformer_flux.py
+++ b/tests/models/transformers/test_models_transformer_flux.py
@@ -29,6 +29,8 @@
class FluxTransformerTests(ModelTesterMixin, unittest.TestCase):
model_class = FluxTransformer2DModel
main_input_name = "hidden_states"
+ # We override the items here because the transformer under consideration is small.
+ model_split_percents = [0.7, 0.6, 0.6]
@property
def dummy_input(self):
@@ -42,8 +44,8 @@ def dummy_input(self):
hidden_states = torch.randn((batch_size, height * width, num_latent_channels)).to(torch_device)
encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim)).to(torch_device)
pooled_prompt_embeds = torch.randn((batch_size, embedding_dim)).to(torch_device)
- text_ids = torch.randn((batch_size, sequence_length, num_image_channels)).to(torch_device)
- image_ids = torch.randn((batch_size, height * width, num_image_channels)).to(torch_device)
+ text_ids = torch.randn((sequence_length, num_image_channels)).to(torch_device)
+ image_ids = torch.randn((height * width, num_image_channels)).to(torch_device)
timestep = torch.tensor([1.0]).to(torch_device).expand(batch_size)
return {
@@ -78,3 +80,31 @@ def prepare_init_args_and_inputs_for_common(self):
inputs_dict = self.dummy_input
return init_dict, inputs_dict
+
+ def test_deprecated_inputs_img_txt_ids_3d(self):
+ init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+ model = self.model_class(**init_dict)
+ model.to(torch_device)
+ model.eval()
+
+ with torch.no_grad():
+ output_1 = model(**inputs_dict).to_tuple()[0]
+
+ # update inputs_dict with txt_ids and img_ids as 3d tensors (deprecated)
+ text_ids_3d = inputs_dict["txt_ids"].unsqueeze(0)
+ image_ids_3d = inputs_dict["img_ids"].unsqueeze(0)
+
+ assert text_ids_3d.ndim == 3, "text_ids_3d should be a 3d tensor"
+ assert image_ids_3d.ndim == 3, "img_ids_3d should be a 3d tensor"
+
+ inputs_dict["txt_ids"] = text_ids_3d
+ inputs_dict["img_ids"] = image_ids_3d
+
+ with torch.no_grad():
+ output_2 = model(**inputs_dict).to_tuple()[0]
+
+ self.assertEqual(output_1.shape, output_2.shape)
+ self.assertTrue(
+ torch.allclose(output_1, output_2, atol=1e-5),
+ msg="output with deprecated inputs (img_ids and txt_ids as 3d torch tensors) are not equal as them as 2d inputs",
+ )
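Annotation: the dummy inputs now build `txt_ids`/`img_ids` without a batch dimension, and the new test checks that the old 3d form (a leading batch axis of 1) still produces identical outputs via the deprecation path. The shape relationship, for reference:

    import torch

    seq_len, id_channels = 8, 3
    text_ids_2d = torch.randn(seq_len, id_channels)  # current expected shape
    text_ids_3d = text_ids_2d.unsqueeze(0)           # deprecated (batch, seq, channels)

    # The deprecated form is just the 2d ids with a leading batch dim of 1.
    assert torch.equal(text_ids_3d[0], text_ids_2d)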
diff --git a/tests/models/transformers/test_models_transformer_lumina.py b/tests/models/transformers/test_models_transformer_lumina.py
new file mode 100644
index 000000000000..0b3e666999e9
--- /dev/null
+++ b/tests/models/transformers/test_models_transformer_lumina.py
@@ -0,0 +1,111 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+
+from diffusers import LuminaNextDiT2DModel
+from diffusers.utils.testing_utils import (
+ enable_full_determinism,
+ torch_device,
+)
+
+from ..test_modeling_common import ModelTesterMixin
+
+
+enable_full_determinism()
+
+
+class LuminaNextDiT2DModelTransformerTests(ModelTesterMixin, unittest.TestCase):
+ model_class = LuminaNextDiT2DModel
+ main_input_name = "hidden_states"
+
+ @property
+ def dummy_input(self):
+ """
+ Args:
+ None
+ Returns:
+ Dict: Dictionary of dummy input tensors
+ """
+ batch_size = 2 # N
+ num_channels = 4 # C
+ height = width = 16 # H, W
+ embedding_dim = 32 # D
+ sequence_length = 16 # L
+
+ hidden_states = torch.randn((batch_size, num_channels, height, width)).to(torch_device)
+ encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim)).to(torch_device)
+ timestep = torch.rand(size=(batch_size,)).to(torch_device)
+ encoder_mask = torch.randn(size=(batch_size, sequence_length)).to(torch_device)
+ image_rotary_emb = torch.randn((384, 384, 4)).to(torch_device)
+
+ return {
+ "hidden_states": hidden_states,
+ "encoder_hidden_states": encoder_hidden_states,
+ "timestep": timestep,
+ "encoder_mask": encoder_mask,
+ "image_rotary_emb": image_rotary_emb,
+ "cross_attention_kwargs": {},
+ }
+
+ @property
+ def input_shape(self):
+ """
+ Args:
+ None
+ Returns:
+ Tuple: (int, int, int)
+ """
+ return (4, 16, 16)
+
+ @property
+ def output_shape(self):
+ """
+ Args:
+ None
+ Returns:
+ Tuple: (int, int, int)
+ """
+ return (4, 16, 16)
+
+ def prepare_init_args_and_inputs_for_common(self):
+ """
+ Args:
+ None
+
+ Returns:
+ Tuple: (Dict, Dict)
+ """
+ init_dict = {
+ "sample_size": 16,
+ "patch_size": 2,
+ "in_channels": 4,
+ "hidden_size": 24,
+ "num_layers": 2,
+ "num_attention_heads": 3,
+ "num_kv_heads": 1,
+ "multiple_of": 16,
+ "ffn_dim_multiplier": None,
+ "norm_eps": 1e-5,
+ "learn_sigma": False,
+ "qk_norm": True,
+ "cross_attention_dim": 32,
+ "scaling_factor": 1.0,
+ }
+
+ inputs_dict = self.dummy_input
+ return init_dict, inputs_dict
diff --git a/tests/models/transformers/test_models_transformer_sd3.py b/tests/models/transformers/test_models_transformer_sd3.py
index 9c927287cb8d..2b9084327289 100644
--- a/tests/models/transformers/test_models_transformer_sd3.py
+++ b/tests/models/transformers/test_models_transformer_sd3.py
@@ -76,3 +76,7 @@ def prepare_init_args_and_inputs_for_common(self):
}
inputs_dict = self.dummy_input
return init_dict, inputs_dict
+
+ @unittest.skip("SD3Transformer2DModel uses a dedicated attention processor. This test doesn't apply")
+ def test_set_attn_processor_for_determinism(self):
+ pass
diff --git a/tests/pipelines/animatediff/test_animatediff_controlnet.py b/tests/pipelines/animatediff/test_animatediff_controlnet.py
index 72315bd0c965..3035fc1e3c61 100644
--- a/tests/pipelines/animatediff/test_animatediff_controlnet.py
+++ b/tests/pipelines/animatediff/test_animatediff_controlnet.py
@@ -20,6 +20,7 @@
)
from diffusers.models.attention import FreeNoiseTransformerBlock
from diffusers.utils import logging
+from diffusers.utils.import_utils import is_xformers_available
from diffusers.utils.testing_utils import torch_device
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
@@ -329,6 +330,13 @@ def test_prompt_embeds(self):
inputs["prompt_embeds"] = torch.randn((1, 4, pipe.text_encoder.config.hidden_size), device=torch_device)
pipe(**inputs)
+ @unittest.skipIf(
+ torch_device != "cuda" or not is_xformers_available(),
+ reason="XFormers attention is only available with CUDA and `xformers` installed",
+ )
+ def test_xformers_attention_forwardGenerator_pass(self):
+ super()._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False)
+
def test_free_init(self):
components = self.get_dummy_components()
pipe: AnimateDiffControlNetPipeline = self.pipeline_class(**components)
diff --git a/tests/pipelines/animatediff/test_animatediff_sparsectrl.py b/tests/pipelines/animatediff/test_animatediff_sparsectrl.py
index 5d8a7228118d..e4cc06e1e797 100644
--- a/tests/pipelines/animatediff/test_animatediff_sparsectrl.py
+++ b/tests/pipelines/animatediff/test_animatediff_sparsectrl.py
@@ -19,6 +19,7 @@
UNetMotionModel,
)
from diffusers.utils import logging
+from diffusers.utils.import_utils import is_xformers_available
from diffusers.utils.testing_utils import torch_device
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
@@ -393,6 +394,13 @@ def test_prompt_embeds(self):
inputs["prompt_embeds"] = torch.randn((1, 4, pipe.text_encoder.config.hidden_size), device=torch_device)
pipe(**inputs)
+ @unittest.skipIf(
+ torch_device != "cuda" or not is_xformers_available(),
+ reason="XFormers attention is only available with CUDA and `xformers` installed",
+ )
+ def test_xformers_attention_forwardGenerator_pass(self):
+ super()._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False)
+
def test_free_init(self):
components = self.get_dummy_components()
pipe: AnimateDiffSparseControlNetPipeline = self.pipeline_class(**components)
diff --git a/tests/pipelines/aura_flow/test_pipeline_aura_flow.py b/tests/pipelines/aura_flow/test_pipeline_aura_flow.py
index 3694a733163c..14bc588df905 100644
--- a/tests/pipelines/aura_flow/test_pipeline_aura_flow.py
+++ b/tests/pipelines/aura_flow/test_pipeline_aura_flow.py
@@ -163,3 +163,7 @@ def test_fused_qkv_projections(self):
assert np.allclose(
original_image_slice, image_slice_disabled, atol=1e-2, rtol=1e-2
), "Original outputs should match when fused QKV projections are disabled."
+
+ @unittest.skip("xformers attention processor does not exist for AuraFlow")
+ def test_xformers_attention_forwardGenerator_pass(self):
+ pass
diff --git a/tests/pipelines/cogvideox/test_cogvideox.py b/tests/pipelines/cogvideox/test_cogvideox.py
index 3ae500eb9567..c69dcfda93c5 100644
--- a/tests/pipelines/cogvideox/test_cogvideox.py
+++ b/tests/pipelines/cogvideox/test_cogvideox.py
@@ -30,7 +30,12 @@
)
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
-from ..test_pipelines_common import PipelineTesterMixin, to_np
+from ..test_pipelines_common import (
+ PipelineTesterMixin,
+ check_qkv_fusion_matches_attn_procs_length,
+ check_qkv_fusion_processors_exist,
+ to_np,
+)
enable_full_determinism()
@@ -275,6 +280,48 @@ def test_vae_tiling(self, expected_diff_max: float = 0.2):
"VAE tiling should not affect the inference results",
)
+ @unittest.skip("xformers attention processor does not exist for CogVideoX")
+ def test_xformers_attention_forwardGenerator_pass(self):
+ pass
+
+ def test_fused_qkv_projections(self):
+ device = "cpu" # ensure determinism for the device-dependent torch.Generator
+ components = self.get_dummy_components()
+ pipe = self.pipeline_class(**components)
+ pipe = pipe.to(device)
+ pipe.set_progress_bar_config(disable=None)
+
+ inputs = self.get_dummy_inputs(device)
+ frames = pipe(**inputs).frames # [B, F, C, H, W]
+ original_image_slice = frames[0, -2:, -1, -3:, -3:]
+
+ pipe.fuse_qkv_projections()
+ assert check_qkv_fusion_processors_exist(
+ pipe.transformer
+ ), "Something wrong with the fused attention processors. Expected all the attention processors to be fused."
+ assert check_qkv_fusion_matches_attn_procs_length(
+ pipe.transformer, pipe.transformer.original_attn_processors
+ ), "Something wrong with the attention processors concerning the fused QKV projections."
+
+ inputs = self.get_dummy_inputs(device)
+ frames = pipe(**inputs).frames
+ image_slice_fused = frames[0, -2:, -1, -3:, -3:]
+
+ pipe.transformer.unfuse_qkv_projections()
+ inputs = self.get_dummy_inputs(device)
+ frames = pipe(**inputs).frames
+ image_slice_disabled = frames[0, -2:, -1, -3:, -3:]
+
+ assert np.allclose(
+ original_image_slice, image_slice_fused, atol=1e-3, rtol=1e-3
+ ), "Fusion of QKV projections shouldn't affect the outputs."
+ assert np.allclose(
+ image_slice_fused, image_slice_disabled, atol=1e-3, rtol=1e-3
+ ), "Outputs, with QKV projection fusion enabled, shouldn't change when fused QKV projections are disabled."
+ assert np.allclose(
+ original_image_slice, image_slice_disabled, atol=1e-2, rtol=1e-2
+ ), "Original outputs should match when fused QKV projections are disabled."
+
@slow
@require_torch_gpu
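Annotation: the new fused-QKV test exercises `fuse_qkv_projections`, which merges the separate query/key/value linears into one projection so a single matmul produces all three, with numerically unchanged outputs. A standalone sketch of the idea (not the diffusers implementation):

    import torch
    import torch.nn as nn

    dim = 32
    x = torch.randn(2, 8, dim)

    to_q, to_k, to_v = (nn.Linear(dim, dim, bias=False) for _ in range(3))

    # Fuse: stack the three weight matrices into one (3*dim, dim) projection.
    fused = nn.Linear(dim, 3 * dim, bias=False)
    fused.weight.data = torch.cat([to_q.weight, to_k.weight, to_v.weight], dim=0)

    q, k, v = fused(x).chunk(3, dim=-1)
    assert torch.allclose(q, to_q(x), atol=1e-6)
    assert torch.allclose(k, to_k(x), atol=1e-6)
    assert torch.allclose(v, to_v(x), atol=1e-6)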
diff --git a/tests/pipelines/flux/test_pipeline_flux.py b/tests/pipelines/flux/test_pipeline_flux.py
index b2744e3f0ad4..57aacd164843 100644
--- a/tests/pipelines/flux/test_pipeline_flux.py
+++ b/tests/pipelines/flux/test_pipeline_flux.py
@@ -13,10 +13,13 @@
torch_device,
)
-from ..test_pipelines_common import PipelineTesterMixin
+from ..test_pipelines_common import (
+ PipelineTesterMixin,
+ check_qkv_fusion_matches_attn_procs_length,
+ check_qkv_fusion_processors_exist,
+)
-@unittest.skipIf(torch_device == "mps", "Flux has a float64 operation which is not supported in MPS.")
class FluxPipelineFastTests(unittest.TestCase, PipelineTesterMixin):
pipeline_class = FluxPipeline
params = frozenset(["prompt", "height", "width", "guidance_scale", "prompt_embeds", "pooled_prompt_embeds"])
@@ -143,6 +146,46 @@ def test_flux_prompt_embeds(self):
max_diff = np.abs(output_with_prompt - output_with_embeds).max()
assert max_diff < 1e-4
+ def test_fused_qkv_projections(self):
+ device = "cpu" # ensure determinism for the device-dependent torch.Generator
+ components = self.get_dummy_components()
+ pipe = self.pipeline_class(**components)
+ pipe = pipe.to(device)
+ pipe.set_progress_bar_config(disable=None)
+
+ inputs = self.get_dummy_inputs(device)
+ image = pipe(**inputs).images
+ original_image_slice = image[0, -3:, -3:, -1]
+
+ # TODO (sayakpaul): will refactor this once `fuse_qkv_projections()` has been added
+ # to the pipeline level.
+ pipe.transformer.fuse_qkv_projections()
+ assert check_qkv_fusion_processors_exist(
+ pipe.transformer
+ ), "Something wrong with the fused attention processors. Expected all the attention processors to be fused."
+ assert check_qkv_fusion_matches_attn_procs_length(
+ pipe.transformer, pipe.transformer.original_attn_processors
+ ), "Something wrong with the attention processors concerning the fused QKV projections."
+
+ inputs = self.get_dummy_inputs(device)
+ image = pipe(**inputs).images
+ image_slice_fused = image[0, -3:, -3:, -1]
+
+ pipe.transformer.unfuse_qkv_projections()
+ inputs = self.get_dummy_inputs(device)
+ image = pipe(**inputs).images
+ image_slice_disabled = image[0, -3:, -3:, -1]
+
+ assert np.allclose(
+ original_image_slice, image_slice_fused, atol=1e-3, rtol=1e-3
+ ), "Fusion of QKV projections shouldn't affect the outputs."
+ assert np.allclose(
+ image_slice_fused, image_slice_disabled, atol=1e-3, rtol=1e-3
+ ), "Outputs, with QKV projection fusion enabled, shouldn't change when fused QKV projections are disabled."
+ assert np.allclose(
+ original_image_slice, image_slice_disabled, atol=1e-2, rtol=1e-2
+ ), "Original outputs should match when fused QKV projections are disabled."
+
@slow
@require_torch_gpu
diff --git a/tests/pipelines/latte/test_latte.py b/tests/pipelines/latte/test_latte.py
index 94ff7fc0faf9..9667ebff249d 100644
--- a/tests/pipelines/latte/test_latte.py
+++ b/tests/pipelines/latte/test_latte.py
@@ -28,6 +28,7 @@
LattePipeline,
LatteTransformer3DModel,
)
+from diffusers.utils.import_utils import is_xformers_available
from diffusers.utils.testing_utils import (
enable_full_determinism,
numpy_cosine_similarity_distance,
@@ -256,6 +257,13 @@ def test_save_load_optional_components(self):
max_diff = np.abs(to_np(output) - to_np(output_loaded)).max()
self.assertLess(max_diff, 1.0)
+ @unittest.skipIf(
+ torch_device != "cuda" or not is_xformers_available(),
+ reason="XFormers attention is only available with CUDA and `xformers` installed",
+ )
+ def test_xformers_attention_forwardGenerator_pass(self):
+ super()._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False)
+
@slow
@require_torch_gpu
diff --git a/tests/pipelines/lumina/test_lumina_nextdit.py b/tests/pipelines/lumina/test_lumina_nextdit.py
index a53758ce2808..d6aeb57b80a1 100644
--- a/tests/pipelines/lumina/test_lumina_nextdit.py
+++ b/tests/pipelines/lumina/test_lumina_nextdit.py
@@ -119,6 +119,10 @@ def test_lumina_prompt_embeds(self):
max_diff = np.abs(output_with_prompt - output_with_embeds).max()
assert max_diff < 1e-4
+ @unittest.skip("xformers attention processor does not exist for Lumina")
+ def test_xformers_attention_forwardGenerator_pass(self):
+ pass
+
@slow
@require_torch_gpu
diff --git a/tests/pipelines/pag/test_pag_controlnet_sdxl_img2img.py b/tests/pipelines/pag/test_pag_controlnet_sdxl_img2img.py
new file mode 100644
index 000000000000..b02f4d8b4561
--- /dev/null
+++ b/tests/pipelines/pag/test_pag_controlnet_sdxl_img2img.py
@@ -0,0 +1,271 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import random
+import unittest
+
+import numpy as np
+import torch
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
+
+from diffusers import (
+ AutoencoderKL,
+ ControlNetModel,
+ EulerDiscreteScheduler,
+ StableDiffusionXLControlNetImg2ImgPipeline,
+ StableDiffusionXLControlNetPAGImg2ImgPipeline,
+ UNet2DConditionModel,
+)
+from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor
+
+from ..pipeline_params import (
+ IMAGE_TO_IMAGE_IMAGE_PARAMS,
+ TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
+ TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS,
+)
+from ..test_pipelines_common import (
+ IPAdapterTesterMixin,
+ PipelineFromPipeTesterMixin,
+ PipelineLatentTesterMixin,
+ PipelineTesterMixin,
+ SDXLOptionalComponentsTesterMixin,
+)
+
+
+enable_full_determinism()
+
+
+class StableDiffusionXLControlNetPAGImg2ImgPipelineFastTests(
+ IPAdapterTesterMixin,
+ PipelineLatentTesterMixin,
+ PipelineTesterMixin,
+ PipelineFromPipeTesterMixin,
+ SDXLOptionalComponentsTesterMixin,
+ unittest.TestCase,
+):
+ pipeline_class = StableDiffusionXLControlNetPAGImg2ImgPipeline
+ params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS.union({"pag_scale", "pag_adaptive_scale"})
+ batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
+ image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
+ image_latents_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
+ callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS.union(
+ {"add_text_embeds", "add_time_ids", "add_neg_time_ids"}
+ )
+
+ # Copied from tests.pipelines.controlnet.test_controlnet_sdxl_img2img.ControlNetPipelineSDXLImg2ImgFastTests.get_dummy_components
+ def get_dummy_components(self, skip_first_text_encoder=False):
+ torch.manual_seed(0)
+ unet = UNet2DConditionModel(
+ block_out_channels=(32, 64),
+ layers_per_block=2,
+ sample_size=32,
+ in_channels=4,
+ out_channels=4,
+ down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+ up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+ # SD2-specific config below
+ attention_head_dim=(2, 4),
+ use_linear_projection=True,
+ addition_embed_type="text_time",
+ addition_time_embed_dim=8,
+ transformer_layers_per_block=(1, 2),
+ projection_class_embeddings_input_dim=80, # 6 * 8 + 32
+ cross_attention_dim=64 if not skip_first_text_encoder else 32,
+ )
+ torch.manual_seed(0)
+ controlnet = ControlNetModel(
+ block_out_channels=(32, 64),
+ layers_per_block=2,
+ in_channels=4,
+ down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+ conditioning_embedding_out_channels=(16, 32),
+ # SD2-specific config below
+ attention_head_dim=(2, 4),
+ use_linear_projection=True,
+ addition_embed_type="text_time",
+ addition_time_embed_dim=8,
+ transformer_layers_per_block=(1, 2),
+ projection_class_embeddings_input_dim=80, # 6 * 8 + 32
+ cross_attention_dim=64,
+ )
+ torch.manual_seed(0)
+ scheduler = EulerDiscreteScheduler(
+ beta_start=0.00085,
+ beta_end=0.012,
+ steps_offset=1,
+ beta_schedule="scaled_linear",
+ timestep_spacing="leading",
+ )
+ torch.manual_seed(0)
+ vae = AutoencoderKL(
+ block_out_channels=[32, 64],
+ in_channels=3,
+ out_channels=3,
+ down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+ up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+ latent_channels=4,
+ )
+ torch.manual_seed(0)
+ text_encoder_config = CLIPTextConfig(
+ bos_token_id=0,
+ eos_token_id=2,
+ hidden_size=32,
+ intermediate_size=37,
+ layer_norm_eps=1e-05,
+ num_attention_heads=4,
+ num_hidden_layers=5,
+ pad_token_id=1,
+ vocab_size=1000,
+ # SD2-specific config below
+ hidden_act="gelu",
+ projection_dim=32,
+ )
+ text_encoder = CLIPTextModel(text_encoder_config)
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+ text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config)
+ tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+ components = {
+ "unet": unet,
+ "controlnet": controlnet,
+ "scheduler": scheduler,
+ "vae": vae,
+ "text_encoder": text_encoder if not skip_first_text_encoder else None,
+ "tokenizer": tokenizer if not skip_first_text_encoder else None,
+ "text_encoder_2": text_encoder_2,
+ "tokenizer_2": tokenizer_2,
+ "image_encoder": None,
+ "feature_extractor": None,
+ }
+ return components
+
+ # based on tests.pipelines.controlnet.test_controlnet_sdxl_img2img.ControlNetPipelineSDXLImg2ImgFastTests.get_dummy_inputs
+ # add `pag_scale` to the inputs
+ def get_dummy_inputs(self, device, seed=0):
+ controlnet_embedder_scale_factor = 2
+ image = floats_tensor(
+ (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor),
+ rng=random.Random(seed),
+ ).to(device)
+
+ if str(device).startswith("mps"):
+ generator = torch.manual_seed(seed)
+ else:
+ generator = torch.Generator(device=device).manual_seed(seed)
+
+ inputs = {
+ "prompt": "A painting of a squirrel eating a burger",
+ "generator": generator,
+ "num_inference_steps": 2,
+ "guidance_scale": 6.0,
+ "pag_scale": 3.0,
+ "output_type": "np",
+ "image": image,
+ "control_image": image,
+ }
+
+ return inputs
+
+ def test_pag_disable_enable(self):
+ device = "cpu" # ensure determinism for the device-dependent torch.Generator
+ components = self.get_dummy_components()
+
+ # base pipeline
+ pipe_sd = StableDiffusionXLControlNetImg2ImgPipeline(**components)
+ pipe_sd = pipe_sd.to(device)
+ pipe_sd.set_progress_bar_config(disable=None)
+
+ inputs = self.get_dummy_inputs(device)
+ del inputs["pag_scale"]
+ assert (
+ "pag_scale" not in inspect.signature(pipe_sd.__call__).parameters
+ ), f"`pag_scale` should not be a call parameter of the base pipeline {pipe_sd.__class__.__name__}."
+ out = pipe_sd(**inputs).images[0, -3:, -3:, -1]
+
+ # pag disabled with pag_scale=0.0
+ pipe_pag = self.pipeline_class(**components)
+ pipe_pag = pipe_pag.to(device)
+ pipe_pag.set_progress_bar_config(disable=None)
+
+ inputs = self.get_dummy_inputs(device)
+ inputs["pag_scale"] = 0.0
+ out_pag_disabled = pipe_pag(**inputs).images[0, -3:, -3:, -1]
+
+ # pag enable
+ pipe_pag = self.pipeline_class(**components, pag_applied_layers=["mid", "up", "down"])
+ pipe_pag = pipe_pag.to(device)
+ pipe_pag.set_progress_bar_config(disable=None)
+
+ inputs = self.get_dummy_inputs(device)
+ out_pag_enabled = pipe_pag(**inputs).images[0, -3:, -3:, -1]
+
+ assert np.abs(out.flatten() - out_pag_disabled.flatten()).max() < 1e-3
+ assert np.abs(out.flatten() - out_pag_enabled.flatten()).max() > 1e-3
+
+ def test_save_load_optional_components(self):
+ pass
+
+ def test_pag_cfg(self):
+ device = "cpu" # ensure determinism for the device-dependent torch.Generator
+ components = self.get_dummy_components()
+
+ pipe_pag = self.pipeline_class(**components, pag_applied_layers=["mid", "up", "down"])
+ pipe_pag = pipe_pag.to(device)
+ pipe_pag.set_progress_bar_config(disable=None)
+
+ inputs = self.get_dummy_inputs(device)
+ image = pipe_pag(**inputs).images
+ image_slice = image[0, -3:, -3:, -1]
+
+ assert image.shape == (
+ 1,
+ 64,
+ 64,
+ 3,
+ ), f"the shape of the output image should be (1, 64, 64, 3) but got {image.shape}"
+ expected_slice = np.array(
+ [0.5562928, 0.44882968, 0.4588066, 0.63200223, 0.5694165, 0.4955688, 0.6126959, 0.57588536, 0.43827885]
+ )
+
+ max_diff = np.abs(image_slice.flatten() - expected_slice).max()
+ assert max_diff < 1e-3, f"output is different from expected, {image_slice.flatten()}"
+
+ def test_pag_uncond(self):
+ device = "cpu" # ensure determinism for the device-dependent torch.Generator
+ components = self.get_dummy_components()
+
+ pipe_pag = self.pipeline_class(**components, pag_applied_layers=["mid", "up", "down"])
+ pipe_pag = pipe_pag.to(device)
+ pipe_pag.set_progress_bar_config(disable=None)
+
+ inputs = self.get_dummy_inputs(device)
+ inputs["guidance_scale"] = 0.0
+ image = pipe_pag(**inputs).images
+ image_slice = image[0, -3:, -3:, -1]
+
+ assert image.shape == (
+ 1,
+ 64,
+ 64,
+ 3,
+ ), f"the shape of the output image should be (1, 64, 64, 3) but got {image.shape}"
+ expected_slice = np.array(
+ [0.5543988, 0.45614323, 0.4665692, 0.6202247, 0.5598917, 0.49621183, 0.6084159, 0.5722314, 0.43945464]
+ )
+
+ max_diff = np.abs(image_slice.flatten() - expected_slice).max()
+ assert max_diff < 1e-3, f"output is different from expected, {image_slice.flatten()}"
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py
index 70a6e444bf13..134175bdaffe 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py
@@ -178,6 +178,46 @@ def test_inference(self):
max_diff = np.abs(image_slice.flatten() - expected_slice).max()
self.assertLessEqual(max_diff, 1e-3)
+ def test_stable_diffusion_latent_upscaler_negative_prompt(self):
+ device = "cpu" # ensure determinism for the device-dependent torch.Generator
+ components = self.get_dummy_components()
+ sd_pipe = StableDiffusionLatentUpscalePipeline(**components)
+ sd_pipe = sd_pipe.to(device)
+ sd_pipe.set_progress_bar_config(disable=None)
+
+ inputs = self.get_dummy_inputs(device)
+ negative_prompt = "french fries"
+ output = sd_pipe(**inputs, negative_prompt=negative_prompt)
+ image = output.images
+ image_slice = image[0, -3:, -3:, -1]
+
+ assert image.shape == (1, 256, 256, 3)
+ expected_slice = np.array(
+ [0.43865365, 0.404124, 0.42618454, 0.44333526, 0.40564927, 0.43818694, 0.4411913, 0.43404633, 0.46392226]
+ )
+
+ assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+ def test_stable_diffusion_latent_upscaler_multiple_init_images(self):
+ device = "cpu" # ensure determinism for the device-dependent torch.Generator
+ components = self.get_dummy_components()
+ sd_pipe = StableDiffusionLatentUpscalePipeline(**components)
+ sd_pipe = sd_pipe.to(device)
+ sd_pipe.set_progress_bar_config(disable=None)
+
+ inputs = self.get_dummy_inputs(device)
+ inputs["prompt"] = [inputs["prompt"]] * 2
+ inputs["image"] = inputs["image"].repeat(2, 1, 1, 1)
+ image = sd_pipe(**inputs).images
+ image_slice = image[-1, -3:, -3:, -1]
+
+ assert image.shape == (2, 256, 256, 3)
+ expected_slice = np.array(
+ [0.38730142, 0.35695046, 0.40646142, 0.40967226, 0.3981609, 0.4195988, 0.4248805, 0.430259, 0.45694894]
+ )
+
+ assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
def test_attention_slicing_forward_pass(self):
super().test_attention_slicing_forward_pass(expected_max_diff=7e-3)
diff --git a/tests/pipelines/test_pipeline_utils.py b/tests/pipelines/test_pipeline_utils.py
index 51d987d8bb11..57194acdcf2a 100644
--- a/tests/pipelines/test_pipeline_utils.py
+++ b/tests/pipelines/test_pipeline_utils.py
@@ -68,25 +68,21 @@ def test_all_is_compatible_variant(self):
"unet/diffusion_pytorch_model.fp16.bin",
"unet/diffusion_pytorch_model.fp16.safetensors",
]
- variant = "fp16"
- self.assertTrue(is_safetensors_compatible(filenames, variant=variant))
+ self.assertTrue(is_safetensors_compatible(filenames))
def test_diffusers_model_is_compatible_variant(self):
filenames = [
"unet/diffusion_pytorch_model.fp16.bin",
"unet/diffusion_pytorch_model.fp16.safetensors",
]
- variant = "fp16"
- self.assertTrue(is_safetensors_compatible(filenames, variant=variant))
+ self.assertTrue(is_safetensors_compatible(filenames))
- def test_diffusers_model_is_compatible_variant_partial(self):
- # pass variant but use the non-variant filenames
+ def test_diffusers_model_is_compatible_variant_mixed(self):
filenames = [
"unet/diffusion_pytorch_model.bin",
- "unet/diffusion_pytorch_model.safetensors",
+ "unet/diffusion_pytorch_model.fp16.safetensors",
]
- variant = "fp16"
- self.assertTrue(is_safetensors_compatible(filenames, variant=variant))
+ self.assertTrue(is_safetensors_compatible(filenames))
def test_diffusers_model_is_not_compatible_variant(self):
filenames = [
@@ -99,36 +95,85 @@ def test_diffusers_model_is_not_compatible_variant(self):
"unet/diffusion_pytorch_model.fp16.bin",
# Removed: 'unet/diffusion_pytorch_model.fp16.safetensors',
]
- variant = "fp16"
- self.assertFalse(is_safetensors_compatible(filenames, variant=variant))
+ self.assertFalse(is_safetensors_compatible(filenames))
def test_transformer_model_is_compatible_variant(self):
filenames = [
"text_encoder/pytorch_model.fp16.bin",
"text_encoder/model.fp16.safetensors",
]
- variant = "fp16"
- self.assertTrue(is_safetensors_compatible(filenames, variant=variant))
+ self.assertTrue(is_safetensors_compatible(filenames))
- def test_transformer_model_is_compatible_variant_partial(self):
- # pass variant but use the non-variant filenames
+ def test_transformer_model_is_not_compatible_variant(self):
filenames = [
- "text_encoder/pytorch_model.bin",
- "text_encoder/model.safetensors",
+ "safety_checker/pytorch_model.fp16.bin",
+ "safety_checker/model.fp16.safetensors",
+ "vae/diffusion_pytorch_model.fp16.bin",
+ "vae/diffusion_pytorch_model.fp16.safetensors",
+ "text_encoder/pytorch_model.fp16.bin",
+ "unet/diffusion_pytorch_model.fp16.bin",
+ "unet/diffusion_pytorch_model.fp16.safetensors",
]
- variant = "fp16"
- self.assertTrue(is_safetensors_compatible(filenames, variant=variant))
+ self.assertFalse(is_safetensors_compatible(filenames))
- def test_transformer_model_is_not_compatible_variant(self):
+ def test_transformer_model_is_compatible_variant_extra_folder(self):
+ filenames = [
+ "safety_checker/pytorch_model.fp16.bin",
+ "safety_checker/model.fp16.safetensors",
+ "vae/diffusion_pytorch_model.fp16.bin",
+ "vae/diffusion_pytorch_model.fp16.safetensors",
+ "text_encoder/pytorch_model.fp16.bin",
+ "unet/diffusion_pytorch_model.fp16.bin",
+ "unet/diffusion_pytorch_model.fp16.safetensors",
+ ]
+ self.assertTrue(is_safetensors_compatible(filenames, folder_names={"vae", "unet"}))
+
+ def test_transformer_model_is_not_compatible_variant_extra_folder(self):
filenames = [
"safety_checker/pytorch_model.fp16.bin",
"safety_checker/model.fp16.safetensors",
"vae/diffusion_pytorch_model.fp16.bin",
"vae/diffusion_pytorch_model.fp16.safetensors",
"text_encoder/pytorch_model.fp16.bin",
- # 'text_encoder/model.fp16.safetensors',
"unet/diffusion_pytorch_model.fp16.bin",
"unet/diffusion_pytorch_model.fp16.safetensors",
]
- variant = "fp16"
- self.assertFalse(is_safetensors_compatible(filenames, variant=variant))
+ self.assertFalse(is_safetensors_compatible(filenames, folder_names={"text_encoder"}))
+
+ def test_transformers_is_compatible_sharded(self):
+ filenames = [
+ "text_encoder/pytorch_model.bin",
+ "text_encoder/model-00001-of-00002.safetensors",
+ "text_encoder/model-00002-of-00002.safetensors",
+ ]
+ self.assertTrue(is_safetensors_compatible(filenames))
+
+ def test_transformers_is_compatible_variant_sharded(self):
+ filenames = [
+ "text_encoder/pytorch_model.bin",
+ "text_encoder/model.fp16-00001-of-00002.safetensors",
+ "text_encoder/model.fp16-00001-of-00002.safetensors",
+ ]
+ self.assertTrue(is_safetensors_compatible(filenames))
+
+ def test_diffusers_is_compatible_sharded(self):
+ filenames = [
+ "unet/diffusion_pytorch_model.bin",
+ "unet/diffusion_pytorch_model-00001-of-00002.safetensors",
+ "unet/diffusion_pytorch_model-00002-of-00002.safetensors",
+ ]
+ self.assertTrue(is_safetensors_compatible(filenames))
+
+ def test_diffusers_is_compatible_variant_sharded(self):
+ filenames = [
+ "unet/diffusion_pytorch_model.bin",
+ "unet/diffusion_pytorch_model.fp16-00001-of-00002.safetensors",
+ "unet/diffusion_pytorch_model.fp16-00001-of-00002.safetensors",
+ ]
+ self.assertTrue(is_safetensors_compatible(filenames))
+
+ def test_diffusers_is_compatible_only_variants(self):
+ filenames = [
+ "unet/diffusion_pytorch_model.fp16.safetensors",
+ ]
+ self.assertTrue(is_safetensors_compatible(filenames))
diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py
index 1d37ae1dc2ca..c73a12a4cbf8 100644
--- a/tests/pipelines/test_pipelines.py
+++ b/tests/pipelines/test_pipelines.py
@@ -551,37 +551,94 @@ def test_download_variant_partly(self):
assert sum(f.endswith(this_format) and not f.endswith(f"{variant}{this_format}") for f in files) == 3
assert not any(f.endswith(other_format) for f in files)
- def test_download_broken_variant(self):
- for use_safetensors in [False, True]:
- # text encoder is missing no variant and "no_ema" variant weights, so the following can't work
- for variant in [None, "no_ema"]:
- with self.assertRaises(OSError) as error_context:
- with tempfile.TemporaryDirectory() as tmpdirname:
- tmpdirname = StableDiffusionPipeline.from_pretrained(
- "hf-internal-testing/stable-diffusion-broken-variants",
- cache_dir=tmpdirname,
- variant=variant,
- use_safetensors=use_safetensors,
- )
-
- assert "Error no file name" in str(error_context.exception)
-
- # text encoder has fp16 variants so we can load it
- with tempfile.TemporaryDirectory() as tmpdirname:
- tmpdirname = StableDiffusionPipeline.download(
+ def test_download_safetensors_only_variant_exists_for_model(self):
+ variant = None
+ use_safetensors = True
+
+ # text encoder is missing non-variant weights, so the following can't work
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ with self.assertRaises(OSError) as error_context:
+ tmpdirname = StableDiffusionPipeline.from_pretrained(
"hf-internal-testing/stable-diffusion-broken-variants",
+ cache_dir=tmpdirname,
+ variant=variant,
use_safetensors=use_safetensors,
+ )
+ assert "Error no file name" in str(error_context.exception)
+
+ # text encoder has fp16 variants so we can load it
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ tmpdirname = StableDiffusionPipeline.download(
+ "hf-internal-testing/stable-diffusion-broken-variants",
+ use_safetensors=use_safetensors,
+ cache_dir=tmpdirname,
+ variant="fp16",
+ )
+ all_root_files = [t[-1] for t in os.walk(tmpdirname)]
+ files = [item for sublist in all_root_files for item in sublist]
+ # None of the downloaded files should be a non-variant file even if we have some here:
+ # https://huggingface.co/hf-internal-testing/stable-diffusion-broken-variants/tree/main/unet
+ assert len(files) == 15, f"We should only download 15 files, not {len(files)}"
+
+ def test_download_bin_only_variant_exists_for_model(self):
+ variant = None
+ use_safetensors = False
+
+ # text encoder is missing non-variant weights, so the following can't work
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ with self.assertRaises(OSError) as error_context:
+ tmpdirname = StableDiffusionPipeline.from_pretrained(
+ "hf-internal-testing/stable-diffusion-broken-variants",
cache_dir=tmpdirname,
- variant="fp16",
+ variant=variant,
+ use_safetensors=use_safetensors,
)
+ assert "Error no file name" in str(error_context.exception)
- all_root_files = [t[-1] for t in os.walk(tmpdirname)]
- files = [item for sublist in all_root_files for item in sublist]
+ # text encoder has fp16 variants so we can load it
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ tmpdirname = StableDiffusionPipeline.download(
+ "hf-internal-testing/stable-diffusion-broken-variants",
+ use_safetensors=use_safetensors,
+ cache_dir=tmpdirname,
+ variant="fp16",
+ )
+ all_root_files = [t[-1] for t in os.walk(tmpdirname)]
+ files = [item for sublist in all_root_files for item in sublist]
+ # None of the downloaded files should be a non-variant file even if we have some here:
+ # https://huggingface.co/hf-internal-testing/stable-diffusion-broken-variants/tree/main/unet
+ assert len(files) == 15, f"We should only download 15 files, not {len(files)}"
- # None of the downloaded files should be a non-variant file even if we have some here:
- # https://huggingface.co/hf-internal-testing/stable-diffusion-broken-variants/tree/main/unet
- assert len(files) == 15, f"We should only download 15 files, not {len(files)}"
- # only unet has "no_ema" variant
+ def test_download_safetensors_variant_does_not_exist_for_model(self):
+ variant = "no_ema"
+ use_safetensors = True
+
+ # text encoder is missing no_ema variant weights, so the following can't work
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ with self.assertRaises(OSError) as error_context:
+ tmpdirname = StableDiffusionPipeline.from_pretrained(
+ "hf-internal-testing/stable-diffusion-broken-variants",
+ cache_dir=tmpdirname,
+ variant=variant,
+ use_safetensors=use_safetensors,
+ )
+
+ assert "Error no file name" in str(error_context.exception)
+
+ def test_download_bin_variant_does_not_exist_for_model(self):
+ variant = "no_ema"
+ use_safetensors = False
+
+ # text encoder is missing no_ema variant weights, so the following can't work
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ with self.assertRaises(OSError) as error_context:
+ tmpdirname = StableDiffusionPipeline.from_pretrained(
+ "hf-internal-testing/stable-diffusion-broken-variants",
+ cache_dir=tmpdirname,
+ variant=variant,
+ use_safetensors=use_safetensors,
+ )
+ assert "Error no file name" in str(error_context.exception)
def test_local_save_load_index(self):
prompt = "hello"
diff --git a/tests/pipelines/text_to_video_synthesis/test_text_to_video.py b/tests/pipelines/text_to_video_synthesis/test_text_to_video.py
index 79e3a7f9b736..033addd51c3d 100644
--- a/tests/pipelines/text_to_video_synthesis/test_text_to_video.py
+++ b/tests/pipelines/text_to_video_synthesis/test_text_to_video.py
@@ -20,12 +20,7 @@
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
-from diffusers import (
- AutoencoderKL,
- DDIMScheduler,
- TextToVideoSDPipeline,
- UNet3DConditionModel,
-)
+from diffusers import AutoencoderKL, DDIMScheduler, TextToVideoSDPipeline, UNet3DConditionModel
from diffusers.utils import is_xformers_available
from diffusers.utils.testing_utils import (
enable_full_determinism,
@@ -64,7 +59,7 @@ class TextToVideoSDPipelineFastTests(PipelineTesterMixin, SDFunctionTesterMixin,
def get_dummy_components(self):
torch.manual_seed(0)
unet = UNet3DConditionModel(
- block_out_channels=(4, 8),
+ block_out_channels=(8, 8),
layers_per_block=1,
sample_size=32,
in_channels=4,
@@ -134,10 +129,7 @@ def get_dummy_inputs(self, device, seed=0):
return inputs
def test_dict_tuple_outputs_equivalent(self):
- expected_slice = None
- if torch_device == "cpu":
- expected_slice = np.array([0.4903, 0.5649, 0.5504, 0.5179, 0.4821, 0.5466, 0.4131, 0.5052, 0.5077])
- return super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice)
+ return super().test_dict_tuple_outputs_equivalent()
def test_text_to_video_default_case(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
@@ -151,9 +143,8 @@ def test_text_to_video_default_case(self):
frames = sd_pipe(**inputs).frames
image_slice = frames[0][0][-3:, -3:, -1]
-
assert frames[0][0].shape == (32, 32, 3)
- expected_slice = np.array([0.7537, 0.1752, 0.6157, 0.5508, 0.4240, 0.4110, 0.4838, 0.5648, 0.5094])
+ expected_slice = np.array([0.8093, 0.2751, 0.6976, 0.5927, 0.4616, 0.4336, 0.5094, 0.5683, 0.4796])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2