diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml index 067c7b613f86..a0517725284e 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.yml +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -73,7 +73,7 @@ body: - ControlNet @sayakpaul @yiyixuxu @DN6 - T2I Adapter @sayakpaul @yiyixuxu @DN6 - IF @DN6 - - Text-to-Video / Video-to-Video @DN6 @sayakpaul + - Text-to-Video / Video-to-Video @DN6 @a-r-r-o-w - Wuerstchen @DN6 - Other: @yiyixuxu @DN6 - Improving generation quality: @asomoza diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 0ebe8d50b65f..e4b2b45a4ecd 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -49,6 +49,7 @@ Core library: Integrations: - deepspeed: HF Trainer/Accelerate: @SunMarc +- PEFT: @sayakpaul @BenjaminBossan HF projects: diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index d638d1435dfc..a85adfc2bfec 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -19,10 +19,11 @@ jobs: strategy: fail-fast: false max-parallel: 1 - runs-on: [single-gpu, nvidia-gpu, a10, ci] + runs-on: + group: aws-g6-4xlarge-plus container: image: diffusers/diffusers-pytorch-compile-cuda - options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0 + options: --shm-size "16gb" --ipc host --gpus 0 steps: - name: Checkout diffusers uses: actions/checkout@v3 diff --git a/.github/workflows/nightly_tests.yml b/.github/workflows/nightly_tests.yml index 3862cfc7d722..4eab95cd83a5 100644 --- a/.github/workflows/nightly_tests.yml +++ b/.github/workflows/nightly_tests.yml @@ -7,7 +7,7 @@ on: env: DIFFUSERS_IS_CI: yes - HF_HOME: /mnt/cache + HF_HUB_ENABLE_HF_TRANSFER: 1 OMP_NUM_THREADS: 8 MKL_NUM_THREADS: 8 PYTEST_TIMEOUT: 600 @@ -18,8 +18,10 @@ env: jobs: setup_torch_cuda_pipeline_matrix: - name: Setup Torch Pipelines Matrix - runs-on: diffusers/diffusers-pytorch-cpu + name: Setup Torch Pipelines CUDA Slow Tests Matrix + runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ] + container: + image: diffusers/diffusers-pytorch-cpu outputs: pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }} steps: @@ -27,10 +29,6 @@ jobs: uses: actions/checkout@v3 with: fetch-depth: 2 - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: "3.8" - name: Install dependencies run: | pip install -e . 
@@ -50,16 +48,17 @@ jobs: path: reports run_nightly_tests_for_torch_pipelines: - name: Torch Pipelines CUDA Nightly Tests + name: Nightly Torch Pipelines CUDA Tests needs: setup_torch_cuda_pipeline_matrix strategy: fail-fast: false + max-parallel: 8 matrix: module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }} runs-on: [single-gpu, nvidia-gpu, t4, ci] container: image: diffusers/diffusers-pytorch-cuda - options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0 + options: --shm-size "16gb" --ipc host --gpus 0 steps: - name: Checkout diffusers uses: actions/checkout@v3 @@ -67,19 +66,16 @@ jobs: fetch-depth: 2 - name: NVIDIA-SMI run: nvidia-smi - - name: Install dependencies run: | python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m uv pip install -e [quality,test] python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git python -m uv pip install pytest-reportlog - - name: Environment run: | python utils/print_env.py - - - name: Nightly PyTorch CUDA checkpoint (pipelines) tests + - name: Pipeline CUDA Test env: HF_TOKEN: ${{ secrets.HF_TOKEN }} # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms @@ -90,38 +86,36 @@ jobs: --make-reports=tests_pipeline_${{ matrix.module }}_cuda \ --report-log=tests_pipeline_${{ matrix.module }}_cuda.log \ tests/pipelines/${{ matrix.module }} - - name: Failure short reports if: ${{ failure() }} run: | cat reports/tests_pipeline_${{ matrix.module }}_cuda_stats.txt cat reports/tests_pipeline_${{ matrix.module }}_cuda_failures_short.txt - - name: Test suite reports artifacts if: ${{ always() }} uses: actions/upload-artifact@v2 with: name: pipeline_${{ matrix.module }}_test_reports path: reports - - name: Generate Report and Notify Channel if: always() run: | pip install slack_sdk tabulate - python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY + python utils/log_reports.py >> $GITHUB_STEP_SUMMARY run_nightly_tests_for_other_torch_modules: - name: Torch Non-Pipelines CUDA Nightly Tests + name: Nightly Torch CUDA Tests runs-on: [single-gpu, nvidia-gpu, t4, ci] container: image: diffusers/diffusers-pytorch-cuda - options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0 + options: --shm-size "16gb" --ipc host --gpus 0 defaults: run: shell: bash strategy: + max-parallel: 2 matrix: - module: [models, schedulers, others, examples] + module: [models, schedulers, lora, others, single_file, examples] steps: - name: Checkout diffusers uses: actions/checkout@v3 @@ -133,8 +127,8 @@ jobs: python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m uv pip install -e [quality,test] python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git + python -m uv pip install peft@git+https://github.com/huggingface/peft.git python -m uv pip install pytest-reportlog - - name: Environment run: python utils/print_env.py @@ -158,7 +152,6 @@ jobs: # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms CUBLAS_WORKSPACE_CONFIG: :16:8 run: | - python -m uv pip install peft@git+https://github.com/huggingface/peft.git python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \ -s -v --make-reports=examples_torch_cuda \ --report-log=examples_torch_cuda.log \ @@ -181,64 +174,7 @@ jobs: if: always() run: | pip install slack_sdk tabulate - python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY - - run_lora_nightly_tests: - name: Nightly 
LoRA Tests with PEFT and TORCH - runs-on: [single-gpu, nvidia-gpu, t4, ci] - container: - image: diffusers/diffusers-pytorch-cuda - options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0 - defaults: - run: - shell: bash - steps: - - name: Checkout diffusers - uses: actions/checkout@v3 - with: - fetch-depth: 2 - - - name: Install dependencies - run: | - python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" - python -m uv pip install -e [quality,test] - python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git - python -m uv pip install peft@git+https://github.com/huggingface/peft.git - python -m uv pip install pytest-reportlog - - - name: Environment - run: python utils/print_env.py - - - name: Run nightly LoRA tests with PEFT and Torch - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms - CUBLAS_WORKSPACE_CONFIG: :16:8 - run: | - python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \ - -s -v -k "not Flax and not Onnx" \ - --make-reports=tests_torch_lora_cuda \ - --report-log=tests_torch_lora_cuda.log \ - tests/lora - - - name: Failure short reports - if: ${{ failure() }} - run: | - cat reports/tests_torch_lora_cuda_stats.txt - cat reports/tests_torch_lora_cuda_failures_short.txt - - - name: Test suite reports artifacts - if: ${{ always() }} - uses: actions/upload-artifact@v2 - with: - name: torch_lora_cuda_test_reports - path: reports - - - name: Generate Report and Notify Channel - if: always() - run: | - pip install slack_sdk tabulate - python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY + python utils/log_reports.py >> $GITHUB_STEP_SUMMARY run_flax_tpu_tests: name: Nightly Flax TPU Tests @@ -294,14 +230,14 @@ jobs: if: always() run: | pip install slack_sdk tabulate - python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY + python utils/log_reports.py >> $GITHUB_STEP_SUMMARY run_nightly_onnx_tests: name: Nightly ONNXRuntime CUDA tests on Ubuntu runs-on: [single-gpu, nvidia-gpu, t4, ci] container: image: diffusers/diffusers-onnxruntime-cuda - options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ + options: --gpus 0 --shm-size "16gb" --ipc host steps: - name: Checkout diffusers @@ -318,11 +254,10 @@ jobs: python -m uv pip install -e [quality,test] python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git python -m uv pip install pytest-reportlog - - name: Environment run: python utils/print_env.py - - name: Run nightly ONNXRuntime CUDA tests + - name: Run Nightly ONNXRuntime CUDA tests env: HF_TOKEN: ${{ secrets.HF_TOKEN }} run: | @@ -349,7 +284,7 @@ jobs: if: always() run: | pip install slack_sdk tabulate - python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY + python utils/log_reports.py >> $GITHUB_STEP_SUMMARY run_nightly_tests_apple_m1: name: Nightly PyTorch MPS tests on MacOS @@ -411,4 +346,4 @@ jobs: if: always() run: | pip install slack_sdk tabulate - python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY + python utils/log_reports.py >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml index 19deecb2beae..e12ead72f9ee 100644 --- a/.github/workflows/push_tests.yml +++ b/.github/workflows/push_tests.yml @@ -11,11 +11,9 @@ on: env: DIFFUSERS_IS_CI: yes - HF_HOME: /mnt/cache OMP_NUM_THREADS: 8 MKL_NUM_THREADS: 8 PYTEST_TIMEOUT: 600 - RUN_SLOW: yes PIPELINE_USAGE_CUTOFF: 50000 jobs: @@ -52,7 +50,7 @@ jobs: path: reports 
torch_pipelines_cuda_tests: - name: Torch Pipelines CUDA Slow Tests + name: Torch Pipelines CUDA Tests needs: setup_torch_cuda_pipeline_matrix strategy: fail-fast: false @@ -62,7 +60,7 @@ jobs: runs-on: [single-gpu, nvidia-gpu, t4, ci] container: image: diffusers/diffusers-pytorch-cuda - options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0 + options: --shm-size "16gb" --ipc host --gpus 0 steps: - name: Checkout diffusers uses: actions/checkout@v3 @@ -106,7 +104,7 @@ jobs: runs-on: [single-gpu, nvidia-gpu, t4, ci] container: image: diffusers/diffusers-pytorch-cuda - options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0 + options: --shm-size "16gb" --ipc host --gpus 0 defaults: run: shell: bash @@ -124,12 +122,13 @@ jobs: python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m uv pip install -e [quality,test] python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git + python -m uv pip install peft@git+https://github.com/huggingface/peft.git - name: Environment run: | python utils/print_env.py - - name: Run slow PyTorch CUDA tests + - name: Run PyTorch CUDA tests env: HF_TOKEN: ${{ secrets.HF_TOKEN }} # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms @@ -153,61 +152,6 @@ jobs: name: torch_cuda_test_reports path: reports - peft_cuda_tests: - name: PEFT CUDA Tests - runs-on: [single-gpu, nvidia-gpu, t4, ci] - container: - image: diffusers/diffusers-pytorch-cuda - options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0 - defaults: - run: - shell: bash - steps: - - name: Checkout diffusers - uses: actions/checkout@v3 - with: - fetch-depth: 2 - - - name: Install dependencies - run: | - python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" - python -m uv pip install -e [quality,test] - python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git - python -m pip install -U peft@git+https://github.com/huggingface/peft.git - - - name: Environment - run: | - python utils/print_env.py - - - name: Run slow PEFT CUDA tests - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms - CUBLAS_WORKSPACE_CONFIG: :16:8 - run: | - python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \ - -s -v -k "not Flax and not Onnx and not PEFTLoRALoading" \ - --make-reports=tests_peft_cuda \ - tests/lora/ - python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \ - -s -v -k "lora and not Flax and not Onnx and not PEFTLoRALoading" \ - --make-reports=tests_peft_cuda_models_lora \ - tests/models/ - - - name: Failure short reports - if: ${{ failure() }} - run: | - cat reports/tests_peft_cuda_stats.txt - cat reports/tests_peft_cuda_failures_short.txt - cat reports/tests_peft_cuda_models_lora_failures_short.txt - - - name: Test suite reports artifacts - if: ${{ always() }} - uses: actions/upload-artifact@v2 - with: - name: torch_peft_test_reports - path: reports - flax_tpu_tests: name: Flax TPU Tests runs-on: docker-tpu @@ -309,7 +253,7 @@ jobs: container: image: diffusers/diffusers-pytorch-compile-cuda - options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + options: --gpus 0 --shm-size "16gb" --ipc host steps: - name: Checkout diffusers @@ -351,7 +295,7 @@ jobs: container: image: diffusers/diffusers-pytorch-xformers-cuda - options: --gpus 0 
--shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + options: --gpus 0 --shm-size "16gb" --ipc host steps: - name: Checkout diffusers @@ -392,7 +336,7 @@ jobs: container: image: diffusers/diffusers-pytorch-cuda - options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + options: --gpus 0 --shm-size "16gb" --ipc host steps: - name: Checkout diffusers diff --git a/.github/workflows/ssh-pr-runner.yml b/.github/workflows/ssh-pr-runner.yml new file mode 100644 index 000000000000..2987f6d15917 --- /dev/null +++ b/.github/workflows/ssh-pr-runner.yml @@ -0,0 +1,39 @@ +name: SSH into PR runners + +on: + workflow_dispatch: + inputs: + docker_image: + description: 'Name of the Docker image' + required: true + +env: + IS_GITHUB_CI: "1" + HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} + HF_HOME: /mnt/cache + DIFFUSERS_IS_CI: yes + OMP_NUM_THREADS: 8 + MKL_NUM_THREADS: 8 + RUN_SLOW: yes + +jobs: + ssh_runner: + name: "SSH" + runs-on: [self-hosted, intel-cpu, 32-cpu, 256-ram, ci] + container: + image: ${{ github.event.inputs.docker_image }} + options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --privileged + + steps: + - name: Checkout diffusers + uses: actions/checkout@v3 + with: + fetch-depth: 2 + + - name: Tailscale # In order to be able to SSH when a test fails + uses: huggingface/tailscale-action@main + with: + authkey: ${{ secrets.TAILSCALE_SSH_AUTHKEY }} + slackChannel: ${{ secrets.SLACK_CIFEEDBACK_CHANNEL }} + slackToken: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + waitForSSH: true diff --git a/.github/workflows/ssh-runner.yml b/.github/workflows/ssh-runner.yml index c24905772c8d..984eb4270544 100644 --- a/.github/workflows/ssh-runner.yml +++ b/.github/workflows/ssh-runner.yml @@ -1,4 +1,4 @@ -name: SSH into runners +name: SSH into GPU runners on: workflow_dispatch: diff --git a/docker/diffusers-onnxruntime-cuda/Dockerfile b/docker/diffusers-onnxruntime-cuda/Dockerfile index 20192175538e..3364698fe945 100644 --- a/docker/diffusers-onnxruntime-cuda/Dockerfile +++ b/docker/diffusers-onnxruntime-cuda/Dockerfile @@ -38,6 +38,7 @@ RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \ datasets \ hf-doc-builder \ huggingface-hub \ + hf_transfer \ Jinja2 \ librosa \ numpy==1.26.4 \ diff --git a/docker/diffusers-pytorch-compile-cuda/Dockerfile b/docker/diffusers-pytorch-compile-cuda/Dockerfile index cbdfe21f09be..a5454328b851 100644 --- a/docker/diffusers-pytorch-compile-cuda/Dockerfile +++ b/docker/diffusers-pytorch-compile-cuda/Dockerfile @@ -16,28 +16,29 @@ RUN apt install -y bash \ ca-certificates \ libsndfile1-dev \ libgl1 \ - python3.9 \ - python3.9-dev \ + python3.10 \ + python3.10-dev \ python3-pip \ - python3.9-venv && \ + python3.10-venv && \ rm -rf /var/lib/apt/lists # make sure to use venv -RUN python3.9 -m venv /opt/venv +RUN python3.10 -m venv /opt/venv ENV PATH="/opt/venv/bin:$PATH" # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py) -RUN python3.9 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \ - python3.9 -m uv pip install --no-cache-dir \ +RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \ + python3.10 -m uv pip install --no-cache-dir \ torch \ torchvision \ torchaudio \ invisible_watermark && \ - python3.9 -m pip install --no-cache-dir \ + python3.10 -m pip install --no-cache-dir \ accelerate \ datasets \ hf-doc-builder \ huggingface-hub \ + hf_transfer \ Jinja2 \ librosa \ 
numpy==1.26.4 \ diff --git a/docker/diffusers-pytorch-cpu/Dockerfile b/docker/diffusers-pytorch-cpu/Dockerfile index e2986e0dd166..910765bb0b9c 100644 --- a/docker/diffusers-pytorch-cpu/Dockerfile +++ b/docker/diffusers-pytorch-cpu/Dockerfile @@ -16,6 +16,7 @@ RUN apt install -y bash \ ca-certificates \ libsndfile1-dev \ python3.10 \ + python3.10-dev \ python3-pip \ libgl1 \ python3.10-venv && \ diff --git a/docker/diffusers-pytorch-cuda/Dockerfile b/docker/diffusers-pytorch-cuda/Dockerfile index f672b7536e29..8b5439ffb6c6 100644 --- a/docker/diffusers-pytorch-cuda/Dockerfile +++ b/docker/diffusers-pytorch-cuda/Dockerfile @@ -17,6 +17,7 @@ RUN apt install -y bash \ libsndfile1-dev \ libgl1 \ python3.10 \ + python3.10-dev \ python3-pip \ python3.10-venv && \ rm -rf /var/lib/apt/lists @@ -37,6 +38,7 @@ RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \ datasets \ hf-doc-builder \ huggingface-hub \ + hf_transfer \ Jinja2 \ librosa \ numpy==1.26.4 \ diff --git a/docker/diffusers-pytorch-xformers-cuda/Dockerfile b/docker/diffusers-pytorch-xformers-cuda/Dockerfile index 998e1a5fd2ff..7a3408c48624 100644 --- a/docker/diffusers-pytorch-xformers-cuda/Dockerfile +++ b/docker/diffusers-pytorch-xformers-cuda/Dockerfile @@ -17,6 +17,7 @@ RUN apt install -y bash \ libsndfile1-dev \ libgl1 \ python3.10 \ + python3.10-dev \ python3-pip \ python3.10-venv && \ rm -rf /var/lib/apt/lists @@ -37,6 +38,7 @@ RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \ datasets \ hf-doc-builder \ huggingface-hub \ + hf_transfer \ Jinja2 \ librosa \ numpy==1.26.4 \ diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 4ef5740da7d2..81d7a95454d7 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -267,6 +267,8 @@ title: HunyuanDiT2DControlNetModel - local: api/models/controlnet_sd3 title: SD3ControlNetModel + - local: api/models/controlnet_sparsectrl + title: SparseControlNetModel title: Models - isExpanded: false sections: @@ -332,6 +334,8 @@ title: Latent Consistency Models - local: api/pipelines/latent_diffusion title: Latent Diffusion + - local: api/pipelines/latte + title: Latte - local: api/pipelines/ledits_pp title: LEDITS++ - local: api/pipelines/lumina diff --git a/docs/source/en/api/loaders/lora.md b/docs/source/en/api/loaders/lora.md index 3a4d21c6a019..2060a1eefd52 100644 --- a/docs/source/en/api/loaders/lora.md +++ b/docs/source/en/api/loaders/lora.md @@ -12,10 +12,13 @@ specific language governing permissions and limitations under the License. # LoRA -LoRA is a fast and lightweight training method that inserts and trains a significantly smaller number of parameters instead of all the model parameters. This produces a smaller file (~100 MBs) and makes it easier to quickly train a model to learn a new concept. LoRA weights are typically loaded into the UNet, text encoder or both. There are two classes for loading LoRA weights: +LoRA is a fast and lightweight training method that inserts and trains a significantly smaller number of parameters instead of all the model parameters. This produces a smaller file (~100 MBs) and makes it easier to quickly train a model to learn a new concept. LoRA weights are typically loaded into the denoiser, text encoder or both. The denoiser usually corresponds to a UNet ([`UNet2DConditionModel`], for example) or a Transformer ([`SD3Transformer2DModel`], for example). 
There are several classes for loading LoRA weights: -- [`LoraLoaderMixin`] provides functions for loading and unloading, fusing and unfusing, enabling and disabling, and more functions for managing LoRA weights. This class can be used with any model. -- [`StableDiffusionXLLoraLoaderMixin`] is a [Stable Diffusion (SDXL)](../../api/pipelines/stable_diffusion/stable_diffusion_xl) version of the [`LoraLoaderMixin`] class for loading and saving LoRA weights. It can only be used with the SDXL model. +- [`StableDiffusionLoraLoaderMixin`] provides functions for loading and unloading, fusing and unfusing, enabling and disabling, and more functions for managing LoRA weights. This class can be used with any model. +- [`StableDiffusionXLLoraLoaderMixin`] is a [Stable Diffusion (SDXL)](../../api/pipelines/stable_diffusion/stable_diffusion_xl) version of the [`StableDiffusionLoraLoaderMixin`] class for loading and saving LoRA weights. It can only be used with the SDXL model. +- [`SD3LoraLoaderMixin`] provides similar functions for [Stable Diffusion 3](https://huggingface.co/blog/sd3). +- [`AmusedLoraLoaderMixin`] is for the [`AmusedPipeline`]. +- [`LoraBaseMixin`] provides a base class with several utility methods to fuse, unfuse, unload, LoRAs and more. @@ -23,10 +26,22 @@ To learn more about how to load LoRA weights, see the [LoRA](../../using-diffuse -## LoraLoaderMixin +## StableDiffusionLoraLoaderMixin -[[autodoc]] loaders.lora.LoraLoaderMixin +[[autodoc]] loaders.lora_pipeline.StableDiffusionLoraLoaderMixin ## StableDiffusionXLLoraLoaderMixin -[[autodoc]] loaders.lora.StableDiffusionXLLoraLoaderMixin \ No newline at end of file +[[autodoc]] loaders.lora_pipeline.StableDiffusionXLLoraLoaderMixin + +## SD3LoraLoaderMixin + +[[autodoc]] loaders.lora_pipeline.SD3LoraLoaderMixin + +## AmusedLoraLoaderMixin + +[[autodoc]] loaders.lora_pipeline.AmusedLoraLoaderMixin + +## LoraBaseMixin + +[[autodoc]] loaders.lora_base.LoraBaseMixin \ No newline at end of file diff --git a/docs/source/en/api/loaders/peft.md b/docs/source/en/api/loaders/peft.md index ecb82c41e754..67a4a7f2a490 100644 --- a/docs/source/en/api/loaders/peft.md +++ b/docs/source/en/api/loaders/peft.md @@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License. # PEFT -Diffusers supports loading adapters such as [LoRA](../../using-diffusers/loading_adapters) with the [PEFT](https://huggingface.co/docs/peft/index) library with the [`~loaders.peft.PeftAdapterMixin`] class. This allows modeling classes in Diffusers like [`UNet2DConditionModel`] to load an adapter. +Diffusers supports loading adapters such as [LoRA](../../using-diffusers/loading_adapters) with the [PEFT](https://huggingface.co/docs/peft/index) library with the [`~loaders.peft.PeftAdapterMixin`] class. This allows modeling classes in Diffusers like [`UNet2DConditionModel`], [`SD3Transformer2DModel`] to operate with an adapter. diff --git a/docs/source/en/api/loaders/unet.md b/docs/source/en/api/loaders/unet.md index d8cfab64221b..16cc319b4ed0 100644 --- a/docs/source/en/api/loaders/unet.md +++ b/docs/source/en/api/loaders/unet.md @@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License. # UNet -Some training methods - like LoRA and Custom Diffusion - typically target the UNet's attention layers, but these training methods can also target other non-attention layers. Instead of training all of a model's parameters, only a subset of the parameters are trained, which is faster and more efficient. 
This class is useful if you're *only* loading weights into a UNet. If you need to load weights into the text encoder or a text encoder and UNet, try using the [`~loaders.LoraLoaderMixin.load_lora_weights`] function instead. +Some training methods - like LoRA and Custom Diffusion - typically target the UNet's attention layers, but these training methods can also target other non-attention layers. Instead of training all of a model's parameters, only a subset of the parameters are trained, which is faster and more efficient. This class is useful if you're *only* loading weights into a UNet. If you need to load weights into the text encoder or a text encoder and UNet, try using the [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] function instead. The [`UNet2DConditionLoadersMixin`] class provides functions for loading and saving weights, fusing and unfusing LoRAs, disabling and enabling LoRAs, and setting and deleting adapters. diff --git a/docs/source/en/api/models/controlnet_sparsectrl.md b/docs/source/en/api/models/controlnet_sparsectrl.md new file mode 100644 index 000000000000..d5d7d358c4d2 --- /dev/null +++ b/docs/source/en/api/models/controlnet_sparsectrl.md @@ -0,0 +1,46 @@ + + +# SparseControlNetModel + +SparseControlNetModel is an implementation of ControlNet for [AnimateDiff](https://arxiv.org/abs/2307.04725). + +ControlNet was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models](https://huggingface.co/papers/2302.05543) by Lvmin Zhang, Anyi Rao, and Maneesh Agrawala. + +The SparseCtrl version of ControlNet was introduced in [SparseCtrl: Adding Sparse Controls to Text-to-Video Diffusion Models](https://arxiv.org/abs/2311.16933) for achieving controlled generation in text-to-video diffusion models by Yuwei Guo, Ceyuan Yang, Anyi Rao, Maneesh Agrawala, Dahua Lin, and Bo Dai. + +The abstract from the paper is: + +*The development of text-to-video (T2V), i.e., generating videos with a given text prompt, has been significantly advanced in recent years. However, relying solely on text prompts often results in ambiguous frame composition due to spatial uncertainty. The research community thus leverages the dense structure signals, e.g., per-frame depth/edge sequences, to enhance controllability, whose collection accordingly increases the burden of inference. In this work, we present SparseCtrl to enable flexible structure control with temporally sparse signals, requiring only one or a few inputs, as shown in Figure 1. It incorporates an additional condition encoder to process these sparse signals while leaving the pre-trained T2V model untouched. The proposed approach is compatible with various modalities, including sketches, depth maps, and RGB images, providing more practical control for video generation and promoting applications such as storyboarding, depth rendering, keyframe animation, and interpolation. Extensive experiments demonstrate the generalization of SparseCtrl on both original and personalized T2V generators. Codes and models will be publicly available at [this https URL](https://guoyww.github.io/projects/SparseCtrl).* + +## Example for loading SparseControlNetModel + +```python +import torch +from diffusers import SparseControlNetModel + +# fp32 variant in float16 +# 1. Scribble checkpoint +controlnet = SparseControlNetModel.from_pretrained("guoyww/animatediff-sparsectrl-scribble", torch_dtype=torch.float16) + +# 2. 
RGB checkpoint +controlnet = SparseControlNetModel.from_pretrained("guoyww/animatediff-sparsectrl-rgb", torch_dtype=torch.float16) + +# For loading fp16 variant, pass `variant="fp16"` as an additional parameter +``` + +## SparseControlNetModel + +[[autodoc]] SparseControlNetModel + +## SparseControlNetOutput + +[[autodoc]] models.controlnet_sparsectrl.SparseControlNetOutput diff --git a/docs/source/en/api/pipelines/animatediff.md b/docs/source/en/api/pipelines/animatediff.md index be18054aed71..835b4d7b7fdd 100644 --- a/docs/source/en/api/pipelines/animatediff.md +++ b/docs/source/en/api/pipelines/animatediff.md @@ -100,6 +100,189 @@ AnimateDiff tends to work better with finetuned Stable Diffusion models. If you +### AnimateDiffSparseControlNetPipeline + +[SparseCtrl: Adding Sparse Controls to Text-to-Video Diffusion Models](https://arxiv.org/abs/2311.16933) for achieving controlled generation in text-to-video diffusion models by Yuwei Guo, Ceyuan Yang, Anyi Rao, Maneesh Agrawala, Dahua Lin, and Bo Dai. + +The abstract from the paper is: + +*The development of text-to-video (T2V), i.e., generating videos with a given text prompt, has been significantly advanced in recent years. However, relying solely on text prompts often results in ambiguous frame composition due to spatial uncertainty. The research community thus leverages the dense structure signals, e.g., per-frame depth/edge sequences, to enhance controllability, whose collection accordingly increases the burden of inference. In this work, we present SparseCtrl to enable flexible structure control with temporally sparse signals, requiring only one or a few inputs, as shown in Figure 1. It incorporates an additional condition encoder to process these sparse signals while leaving the pre-trained T2V model untouched. The proposed approach is compatible with various modalities, including sketches, depth maps, and RGB images, providing more practical control for video generation and promoting applications such as storyboarding, depth rendering, keyframe animation, and interpolation. Extensive experiments demonstrate the generalization of SparseCtrl on both original and personalized T2V generators. 
Codes and models will be publicly available at [this https URL](https://guoyww.github.io/projects/SparseCtrl).* + +SparseCtrl introduces the following checkpoints for controlled text-to-video generation: + +- [SparseCtrl Scribble](https://huggingface.co/guoyww/animatediff-sparsectrl-scribble) +- [SparseCtrl RGB](https://huggingface.co/guoyww/animatediff-sparsectrl-rgb) + +#### Using SparseCtrl Scribble + +```python +import torch + +from diffusers import AnimateDiffSparseControlNetPipeline +from diffusers.models import AutoencoderKL, MotionAdapter, SparseControlNetModel +from diffusers.schedulers import DPMSolverMultistepScheduler +from diffusers.utils import export_to_gif, load_image + + +model_id = "SG161222/Realistic_Vision_V5.1_noVAE" +motion_adapter_id = "guoyww/animatediff-motion-adapter-v1-5-3" +controlnet_id = "guoyww/animatediff-sparsectrl-scribble" +lora_adapter_id = "guoyww/animatediff-motion-lora-v1-5-3" +vae_id = "stabilityai/sd-vae-ft-mse" +device = "cuda" + +motion_adapter = MotionAdapter.from_pretrained(motion_adapter_id, torch_dtype=torch.float16).to(device) +controlnet = SparseControlNetModel.from_pretrained(controlnet_id, torch_dtype=torch.float16).to(device) +vae = AutoencoderKL.from_pretrained(vae_id, torch_dtype=torch.float16).to(device) +scheduler = DPMSolverMultistepScheduler.from_pretrained( + model_id, + subfolder="scheduler", + beta_schedule="linear", + algorithm_type="dpmsolver++", + use_karras_sigmas=True, +) +pipe = AnimateDiffSparseControlNetPipeline.from_pretrained( + model_id, + motion_adapter=motion_adapter, + controlnet=controlnet, + vae=vae, + scheduler=scheduler, + torch_dtype=torch.float16, +).to(device) +pipe.load_lora_weights(lora_adapter_id, adapter_name="motion_lora") +pipe.fuse_lora(lora_scale=1.0) + +prompt = "an aerial view of a cyberpunk city, night time, neon lights, masterpiece, high quality" +negative_prompt = "low quality, worst quality, letterboxed" + +image_files = [ + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-scribble-1.png", + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-scribble-2.png", + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-scribble-3.png" +] +condition_frame_indices = [0, 8, 15] +conditioning_frames = [load_image(img_file) for img_file in image_files] + +video = pipe( + prompt=prompt, + negative_prompt=negative_prompt, + num_inference_steps=25, + conditioning_frames=conditioning_frames, + controlnet_conditioning_scale=1.0, + controlnet_frame_indices=condition_frame_indices, + generator=torch.Generator().manual_seed(1337), +).frames[0] +export_to_gif(video, "output.gif") +``` + +Here are some sample outputs: + + + +
+[Sample output table: the three conditioning scribbles (scribble-1, scribble-2, scribble-3) and the generated video for the prompt "an aerial view of a cyberpunk city, night time, neon lights, masterpiece, high quality".]
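If the scribble pipeline above is tight on VRAM, offloading and VAE slicing are the usual first steps. This is a minimal sketch and not part of the original example; it assumes `pipe` is the `AnimateDiffSparseControlNetPipeline` built above, and that the explicit `.to(device)` calls are skipped so offloading can manage device placement.

```python
# A minimal sketch of optional memory savings, assuming `pipe` is the
# AnimateDiffSparseControlNetPipeline constructed in the scribble example above
# (skip the explicit `.to(device)` calls so offloading can manage placement).
pipe.enable_model_cpu_offload()  # keep sub-models on CPU, moving each to GPU only while it runs
pipe.vae.enable_slicing()        # decode the generated frames in slices to lower peak memory

# The pipeline call itself is unchanged from the example above.
```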
+ +#### Using SparseCtrl RGB + +```python +import torch + +from diffusers import AnimateDiffSparseControlNetPipeline +from diffusers.models import AutoencoderKL, MotionAdapter, SparseControlNetModel +from diffusers.schedulers import DPMSolverMultistepScheduler +from diffusers.utils import export_to_gif, load_image + + +model_id = "SG161222/Realistic_Vision_V5.1_noVAE" +motion_adapter_id = "guoyww/animatediff-motion-adapter-v1-5-3" +controlnet_id = "guoyww/animatediff-sparsectrl-rgb" +lora_adapter_id = "guoyww/animatediff-motion-lora-v1-5-3" +vae_id = "stabilityai/sd-vae-ft-mse" +device = "cuda" + +motion_adapter = MotionAdapter.from_pretrained(motion_adapter_id, torch_dtype=torch.float16).to(device) +controlnet = SparseControlNetModel.from_pretrained(controlnet_id, torch_dtype=torch.float16).to(device) +vae = AutoencoderKL.from_pretrained(vae_id, torch_dtype=torch.float16).to(device) +scheduler = DPMSolverMultistepScheduler.from_pretrained( + model_id, + subfolder="scheduler", + beta_schedule="linear", + algorithm_type="dpmsolver++", + use_karras_sigmas=True, +) +pipe = AnimateDiffSparseControlNetPipeline.from_pretrained( + model_id, + motion_adapter=motion_adapter, + controlnet=controlnet, + vae=vae, + scheduler=scheduler, + torch_dtype=torch.float16, +).to(device) +pipe.load_lora_weights(lora_adapter_id, adapter_name="motion_lora") + +image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-firework.png") + +video = pipe( + prompt="closeup face photo of man in black clothes, night city street, bokeh, fireworks in background", + negative_prompt="low quality, worst quality", + num_inference_steps=25, + conditioning_frames=image, + controlnet_frame_indices=[0], + controlnet_conditioning_scale=1.0, + generator=torch.Generator().manual_seed(42), +).frames[0] +export_to_gif(video, "output.gif") +``` + +Here are some sample outputs: + + + +
+[Sample output table: the conditioning image and the generated video for the prompt "closeup face photo of man in black clothes, night city street, bokeh, fireworks in background".]
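Both SparseCtrl examples write a GIF with `export_to_gif`. If an MP4 is more convenient, the same list of frames can be written with `export_to_video` from `diffusers.utils`. A small sketch, assuming `video` is the list of frames returned by the RGB example above and that a video backend (OpenCV or imageio) is installed for the helper:

```python
from diffusers.utils import export_to_video

# `video` is the list of PIL frames returned by the pipeline call in the example above
export_to_video(video, "output.mp4", fps=8)
```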
+ ### AnimateDiffSDXLPipeline AnimateDiff can also be used with SDXL models. This is currently an experimental feature as only a beta release of the motion adapter checkpoint is available. @@ -571,7 +754,6 @@ ckpt_path = "https://huggingface.co/Lightricks/LongAnimateDiff/blob/main/lt_long adapter = MotionAdapter.from_single_file(ckpt_path, torch_dtype=torch.float16) pipe = AnimateDiffPipeline.from_pretrained("emilianJR/epiCRealism", motion_adapter=adapter) - ``` ## AnimateDiffPipeline @@ -580,6 +762,12 @@ pipe = AnimateDiffPipeline.from_pretrained("emilianJR/epiCRealism", motion_adapt - all - __call__ +## AnimateDiffSparseControlNetPipeline + +[[autodoc]] AnimateDiffSparseControlNetPipeline + - all + - __call__ + ## AnimateDiffSDXLPipeline [[autodoc]] AnimateDiffSDXLPipeline diff --git a/docs/source/en/api/pipelines/kolors.md b/docs/source/en/api/pipelines/kolors.md index 1c083e4285bc..a35a29d8a061 100644 --- a/docs/source/en/api/pipelines/kolors.md +++ b/docs/source/en/api/pipelines/kolors.md @@ -41,6 +41,64 @@ image = pipe( image.save("kolors_sample.png") ``` +### IP Adapter + +Kolors needs a different IP Adapter to work, and it uses [Openai-CLIP-336](https://huggingface.co/openai/clip-vit-large-patch14-336) as an image encoder. + + + +Using an IP Adapter with Kolors requires more than 24GB of VRAM. To use it, we recommend using [`~DiffusionPipeline.enable_model_cpu_offload`] on consumer GPUs. + + + + + +While Kolors is integrated in Diffusers, you need to load the image encoder from a revision to use the safetensor files. You can still use the main branch of the original repository if you're comfortable loading pickle checkpoints. + + + +```python +import torch +from transformers import CLIPVisionModelWithProjection + +from diffusers import DPMSolverMultistepScheduler, KolorsPipeline +from diffusers.utils import load_image + +image_encoder = CLIPVisionModelWithProjection.from_pretrained( + "Kwai-Kolors/Kolors-IP-Adapter-Plus", + subfolder="image_encoder", + low_cpu_mem_usage=True, + torch_dtype=torch.float16, + revision="refs/pr/4", +) + +pipe = KolorsPipeline.from_pretrained( + "Kwai-Kolors/Kolors-diffusers", image_encoder=image_encoder, torch_dtype=torch.float16, variant="fp16" +).to("cuda") +pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config, use_karras_sigmas=True) + +pipe.load_ip_adapter( + "Kwai-Kolors/Kolors-IP-Adapter-Plus", + subfolder="", + weight_name="ip_adapter_plus_general.safetensors", + revision="refs/pr/4", + image_encoder_folder=None, +) +pipe.enable_model_cpu_offload() + +ipa_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/kolors/cat_square.png") + +image = pipe( + prompt="best quality, high quality", + negative_prompt="", + guidance_scale=6.5, + num_inference_steps=25, + ip_adapter_image=ipa_image, +).images[0] + +image.save("kolors_ipa_sample.png") +``` + ## KolorsPipeline [[autodoc]] KolorsPipeline diff --git a/docs/source/en/api/pipelines/latte.md b/docs/source/en/api/pipelines/latte.md new file mode 100644 index 000000000000..2572e11e152d --- /dev/null +++ b/docs/source/en/api/pipelines/latte.md @@ -0,0 +1,75 @@ + + +# Latte + +![latte text-to-video](https://github.com/Vchitect/Latte/blob/52bc0029899babbd6e9250384c83d8ed2670ff7a/visuals/latte.gif?raw=true) + +[Latte: Latent Diffusion Transformer for Video Generation](https://arxiv.org/abs/2401.03048) from Monash University, Shanghai AI Lab, Nanjing University, and Nanyang Technological University. 
+ +The abstract from the paper is: + +*We propose a novel Latent Diffusion Transformer, namely Latte, for video generation. Latte first extracts spatio-temporal tokens from input videos and then adopts a series of Transformer blocks to model video distribution in the latent space. In order to model a substantial number of tokens extracted from videos, four efficient variants are introduced from the perspective of decomposing the spatial and temporal dimensions of input videos. To improve the quality of generated videos, we determine the best practices of Latte through rigorous experimental analysis, including video clip patch embedding, model variants, timestep-class information injection, temporal positional embedding, and learning strategies. Our comprehensive evaluation demonstrates that Latte achieves state-of-the-art performance across four standard video generation datasets, i.e., FaceForensics, SkyTimelapse, UCF101, and Taichi-HD. In addition, we extend Latte to text-to-video generation (T2V) task, where Latte achieves comparable results compared to recent T2V models. We strongly believe that Latte provides valuable insights for future research on incorporating Transformers into diffusion models for video generation.* + +**Highlights**: Latte is a latent diffusion transformer proposed as a backbone for modeling different modalities (trained for text-to-video generation here). It achieves state-of-the-art performance across four standard video benchmarks - [FaceForensics](https://arxiv.org/abs/1803.09179), [SkyTimelapse](https://arxiv.org/abs/1709.07592), [UCF101](https://arxiv.org/abs/1212.0402) and [Taichi-HD](https://arxiv.org/abs/2003.00196). To prepare and download the datasets for evaluation, please refer to [this https URL](https://github.com/Vchitect/Latte/blob/main/docs/datasets_evaluation.md). + + + +Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading.md#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. + + + +### Inference + +Use [`torch.compile`](https://huggingface.co/docs/diffusers/main/en/tutorials/fast_diffusion#torchcompile) to reduce the inference latency. + +First, load the pipeline: + +```python +import torch +from diffusers import LattePipeline + +pipeline = LattePipeline.from_pretrained( + "maxin-cn/Latte-1", torch_dtype=torch.float16 +).to("cuda") +``` + +Then change the memory layout of the pipelines `transformer` and `vae` components to `torch.channels-last`: + +```python +pipeline.transformer.to(memory_format=torch.channels_last) +pipeline.vae.to(memory_format=torch.channels_last) +``` + +Finally, compile the components and run inference: + +```python +pipeline.transformer = torch.compile(pipeline.transformer) +pipeline.vae.decode = torch.compile(pipeline.vae.decode) + +video = pipeline(prompt="A dog wearing sunglasses floating in space, surreal, nebulae in background").frames[0] +``` + +The [benchmark](https://gist.github.com/a-r-r-o-w/4e1694ca46374793c0361d740a99ff19) results on an 80GB A100 machine are: + +``` +Without torch.compile(): Average inference time: 16.246 seconds. +With torch.compile(): Average inference time: 14.573 seconds. 
+``` + +## LattePipeline + +[[autodoc]] LattePipeline + - all + - __call__ diff --git a/docs/source/en/tutorials/basic_training.md b/docs/source/en/tutorials/basic_training.md index 4f7c29cfe513..402c8c59b17d 100644 --- a/docs/source/en/tutorials/basic_training.md +++ b/docs/source/en/tutorials/basic_training.md @@ -340,7 +340,8 @@ Now you can wrap all these components together in a training loop with 🤗 Acce ... loss = F.mse_loss(noise_pred, noise) ... accelerator.backward(loss) -... accelerator.clip_grad_norm_(model.parameters(), 1.0) +... if accelerator.sync_gradients: +... accelerator.clip_grad_norm_(model.parameters(), 1.0) ... optimizer.step() ... lr_scheduler.step() ... optimizer.zero_grad() diff --git a/docs/source/en/tutorials/using_peft_for_inference.md b/docs/source/en/tutorials/using_peft_for_inference.md index 1bfb3f5c48b7..c37dd90fa172 100644 --- a/docs/source/en/tutorials/using_peft_for_inference.md +++ b/docs/source/en/tutorials/using_peft_for_inference.md @@ -191,7 +191,7 @@ image ## Manage active adapters -You have attached multiple adapters in this tutorial, and if you're feeling a bit lost on what adapters have been attached to the pipeline's components, use the [`~diffusers.loaders.LoraLoaderMixin.get_active_adapters`] method to check the list of active adapters: +You have attached multiple adapters in this tutorial, and if you're feeling a bit lost on what adapters have been attached to the pipeline's components, use the [`~diffusers.loaders.StableDiffusionLoraLoaderMixin.get_active_adapters`] method to check the list of active adapters: ```py active_adapters = pipe.get_active_adapters() @@ -199,7 +199,7 @@ active_adapters ["toy", "pixel"] ``` -You can also get the active adapters of each pipeline component with [`~diffusers.loaders.LoraLoaderMixin.get_list_adapters`]: +You can also get the active adapters of each pipeline component with [`~diffusers.loaders.StableDiffusionLoraLoaderMixin.get_list_adapters`]: ```py list_adapters_component_wise = pipe.get_list_adapters() diff --git a/docs/source/en/using-diffusers/inference_with_lcm.md b/docs/source/en/using-diffusers/inference_with_lcm.md index ff436a655fe6..20cae677791e 100644 --- a/docs/source/en/using-diffusers/inference_with_lcm.md +++ b/docs/source/en/using-diffusers/inference_with_lcm.md @@ -64,7 +64,7 @@ image -To use LCM-LoRAs, you need to replace the scheduler with the [`LCMScheduler`] and load the LCM-LoRA weights with the [`~loaders.LoraLoaderMixin.load_lora_weights`] method. Then you can use the pipeline as usual, and pass a text prompt to generate an image in just 4 steps. +To use LCM-LoRAs, you need to replace the scheduler with the [`LCMScheduler`] and load the LCM-LoRA weights with the [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method. Then you can use the pipeline as usual, and pass a text prompt to generate an image in just 4 steps. A couple of notes to keep in mind when using LCM-LoRAs are: @@ -156,7 +156,7 @@ image -To use LCM-LoRAs for image-to-image, you need to replace the scheduler with the [`LCMScheduler`] and load the LCM-LoRA weights with the [`~loaders.LoraLoaderMixin.load_lora_weights`] method. Then you can use the pipeline as usual, and pass a text prompt and initial image to generate an image in just 4 steps. +To use LCM-LoRAs for image-to-image, you need to replace the scheduler with the [`LCMScheduler`] and load the LCM-LoRA weights with the [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method. 
Then you can use the pipeline as usual, and pass a text prompt and initial image to generate an image in just 4 steps. > [!TIP] > Experiment with different values for `num_inference_steps`, `strength`, and `guidance_scale` to get the best results. @@ -207,7 +207,7 @@ image ## Inpainting -To use LCM-LoRAs for inpainting, you need to replace the scheduler with the [`LCMScheduler`] and load the LCM-LoRA weights with the [`~loaders.LoraLoaderMixin.load_lora_weights`] method. Then you can use the pipeline as usual, and pass a text prompt, initial image, and mask image to generate an image in just 4 steps. +To use LCM-LoRAs for inpainting, you need to replace the scheduler with the [`LCMScheduler`] and load the LCM-LoRA weights with the [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method. Then you can use the pipeline as usual, and pass a text prompt, initial image, and mask image to generate an image in just 4 steps. ```py import torch @@ -262,7 +262,7 @@ LCMs are compatible with adapters like LoRA, ControlNet, T2I-Adapter, and Animat -Load the LCM checkpoint for your supported model into [`UNet2DConditionModel`] and replace the scheduler with the [`LCMScheduler`]. Then you can use the [`~loaders.LoraLoaderMixin.load_lora_weights`] method to load the LoRA weights into the LCM and generate a styled image in a few steps. +Load the LCM checkpoint for your supported model into [`UNet2DConditionModel`] and replace the scheduler with the [`LCMScheduler`]. Then you can use the [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method to load the LoRA weights into the LCM and generate a styled image in a few steps. ```python from diffusers import StableDiffusionXLPipeline, UNet2DConditionModel, LCMScheduler @@ -294,7 +294,7 @@ image -Replace the scheduler with the [`LCMScheduler`]. Then you can use the [`~loaders.LoraLoaderMixin.load_lora_weights`] method to load the LCM-LoRA weights and the style LoRA you want to use. Combine both LoRA adapters with the [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] method and generate a styled image in a few steps. +Replace the scheduler with the [`LCMScheduler`]. Then you can use the [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method to load the LCM-LoRA weights and the style LoRA you want to use. Combine both LoRA adapters with the [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] method and generate a styled image in a few steps. ```py import torch @@ -389,7 +389,7 @@ make_image_grid([canny_image, image], rows=1, cols=2) -Load a ControlNet model trained on canny images and pass it to the [`ControlNetModel`]. Then you can load a Stable Diffusion v1.5 model into [`StableDiffusionControlNetPipeline`] and replace the scheduler with the [`LCMScheduler`]. Use the [`~loaders.LoraLoaderMixin.load_lora_weights`] method to load the LCM-LoRA weights, and pass the canny image to the pipeline and generate an image. +Load a ControlNet model trained on canny images and pass it to the [`ControlNetModel`]. Then you can load a Stable Diffusion v1.5 model into [`StableDiffusionControlNetPipeline`] and replace the scheduler with the [`LCMScheduler`]. Use the [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method to load the LCM-LoRA weights, and pass the canny image to the pipeline and generate an image. > [!TIP] > Experiment with different values for `num_inference_steps`, `controlnet_conditioning_scale`, `cross_attention_kwargs`, and `guidance_scale` to get the best results. 
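For orientation, the LCM-LoRA plus ControlNet flow described above looks roughly like the sketch below. It is not the guide's exact example: the repository IDs are illustrative, and `canny_image` is assumed to be the canny conditioning image prepared earlier in the guide.

```python
import torch
from diffusers import ControlNetModel, LCMScheduler, StableDiffusionControlNetPipeline

# A sketch of the flow described above; repo IDs are illustrative and
# `canny_image` is assumed to be the canny conditioning image from the guide.
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    controlnet=controlnet,
    torch_dtype=torch.float16,
    safety_checker=None,
).to("cuda")

# swap in the LCM scheduler and load the LCM-LoRA weights
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5")

image = pipe(
    "the mona lisa",
    image=canny_image,
    num_inference_steps=4,
    guidance_scale=1.5,
    controlnet_conditioning_scale=0.8,
).images[0]
```

As the tip above suggests, `num_inference_steps`, `guidance_scale`, and `controlnet_conditioning_scale` are the main knobs worth tuning.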
@@ -525,7 +525,7 @@ image = pipe( -Load a T2IAdapter trained on canny images and pass it to the [`StableDiffusionXLAdapterPipeline`]. Replace the scheduler with the [`LCMScheduler`], and use the [`~loaders.LoraLoaderMixin.load_lora_weights`] method to load the LCM-LoRA weights. Pass the canny image to the pipeline and generate an image. +Load a T2IAdapter trained on canny images and pass it to the [`StableDiffusionXLAdapterPipeline`]. Replace the scheduler with the [`LCMScheduler`], and use the [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method to load the LCM-LoRA weights. Pass the canny image to the pipeline and generate an image. ```py import torch diff --git a/docs/source/en/using-diffusers/loading_adapters.md b/docs/source/en/using-diffusers/loading_adapters.md index a3523d3c3d41..9616cf0be412 100644 --- a/docs/source/en/using-diffusers/loading_adapters.md +++ b/docs/source/en/using-diffusers/loading_adapters.md @@ -116,7 +116,7 @@ import torch pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda") ``` -Then use the [`~loaders.LoraLoaderMixin.load_lora_weights`] method to load the [ostris/super-cereal-sdxl-lora](https://huggingface.co/ostris/super-cereal-sdxl-lora) weights and specify the weights filename from the repository: +Then use the [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method to load the [ostris/super-cereal-sdxl-lora](https://huggingface.co/ostris/super-cereal-sdxl-lora) weights and specify the weights filename from the repository: ```py pipeline.load_lora_weights("ostris/super-cereal-sdxl-lora", weight_name="cereal_box_sdxl_v1.safetensors") @@ -129,7 +129,7 @@ image -The [`~loaders.LoraLoaderMixin.load_lora_weights`] method loads LoRA weights into both the UNet and text encoder. It is the preferred way for loading LoRAs because it can handle cases where: +The [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method loads LoRA weights into both the UNet and text encoder. It is the preferred way for loading LoRAs because it can handle cases where: - the LoRA weights don't have separate identifiers for the UNet and text encoder - the LoRA weights have separate identifiers for the UNet and text encoder @@ -153,7 +153,7 @@ image -To unload the LoRA weights, use the [`~loaders.LoraLoaderMixin.unload_lora_weights`] method to discard the LoRA weights and restore the model to its original weights: +To unload the LoRA weights, use the [`~loaders.StableDiffusionLoraLoaderMixin.unload_lora_weights`] method to discard the LoRA weights and restore the model to its original weights: ```py pipeline.unload_lora_weights() @@ -161,9 +161,9 @@ pipeline.unload_lora_weights() ### Adjust LoRA weight scale -For both [`~loaders.LoraLoaderMixin.load_lora_weights`] and [`~loaders.UNet2DConditionLoadersMixin.load_attn_procs`], you can pass the `cross_attention_kwargs={"scale": 0.5}` parameter to adjust how much of the LoRA weights to use. A value of `0` is the same as only using the base model weights, and a value of `1` is equivalent to using the fully finetuned LoRA. +For both [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] and [`~loaders.UNet2DConditionLoadersMixin.load_attn_procs`], you can pass the `cross_attention_kwargs={"scale": 0.5}` parameter to adjust how much of the LoRA weights to use. A value of `0` is the same as only using the base model weights, and a value of `1` is equivalent to using the fully finetuned LoRA. 
-For more granular control on the amount of LoRA weights used per layer, you can use [`~loaders.LoraLoaderMixin.set_adapters`] and pass a dictionary specifying by how much to scale the weights in each layer by. +For more granular control on the amount of LoRA weights used per layer, you can use [`~loaders.StableDiffusionLoraLoaderMixin.set_adapters`] and pass a dictionary specifying by how much to scale the weights in each layer by. ```python pipe = ... # create pipeline pipe.load_lora_weights(..., adapter_name="my_adapter") @@ -186,7 +186,7 @@ This also works with multiple adapters - see [this guide](https://huggingface.co -Currently, [`~loaders.LoraLoaderMixin.set_adapters`] only supports scaling attention weights. If a LoRA has other parts (e.g., resnets or down-/upsamplers), they will keep a scale of 1.0. +Currently, [`~loaders.StableDiffusionLoraLoaderMixin.set_adapters`] only supports scaling attention weights. If a LoRA has other parts (e.g., resnets or down-/upsamplers), they will keep a scale of 1.0. @@ -203,7 +203,7 @@ To load a Kohya LoRA, let's download the [Blueprintify SD XL 1.0](https://civita !wget https://civitai.com/api/download/models/168776 -O blueprintify-sd-xl-10.safetensors ``` -Load the LoRA checkpoint with the [`~loaders.LoraLoaderMixin.load_lora_weights`] method, and specify the filename in the `weight_name` parameter: +Load the LoRA checkpoint with the [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method, and specify the filename in the `weight_name` parameter: ```py from diffusers import AutoPipelineForText2Image @@ -227,7 +227,7 @@ image Some limitations of using Kohya LoRAs with 🤗 Diffusers include: - Images may not look like those generated by UIs - like ComfyUI - for multiple reasons, which are explained [here](https://github.com/huggingface/diffusers/pull/4287/#issuecomment-1655110736). -- [LyCORIS checkpoints](https://github.com/KohakuBlueleaf/LyCORIS) aren't fully supported. The [`~loaders.LoraLoaderMixin.load_lora_weights`] method loads LyCORIS checkpoints with LoRA and LoCon modules, but Hada and LoKR are not supported. +- [LyCORIS checkpoints](https://github.com/KohakuBlueleaf/LyCORIS) aren't fully supported. The [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method loads LyCORIS checkpoints with LoRA and LoCon modules, but Hada and LoKR are not supported. diff --git a/docs/source/en/using-diffusers/merge_loras.md b/docs/source/en/using-diffusers/merge_loras.md index 8b533b80c217..c52b81330b3c 100644 --- a/docs/source/en/using-diffusers/merge_loras.md +++ b/docs/source/en/using-diffusers/merge_loras.md @@ -14,9 +14,9 @@ specific language governing permissions and limitations under the License. It can be fun and creative to use multiple [LoRAs]((https://huggingface.co/docs/peft/conceptual_guides/adapter#low-rank-adaptation-lora)) together to generate something entirely new and unique. This works by merging multiple LoRA weights together to produce images that are a blend of different styles. Diffusers provides a few methods to merge LoRAs depending on *how* you want to merge their weights, which can affect image quality. -This guide will show you how to merge LoRAs using the [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] and [`~peft.LoraModel.add_weighted_adapter`] methods. To improve inference speed and reduce memory-usage of merged LoRAs, you'll also see how to use the [`~loaders.LoraLoaderMixin.fuse_lora`] method to fuse the LoRA weights with the original weights of the underlying model. 
+This guide will show you how to merge LoRAs using the [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] and [`~peft.LoraModel.add_weighted_adapter`] methods. To improve inference speed and reduce memory-usage of merged LoRAs, you'll also see how to use the [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] method to fuse the LoRA weights with the original weights of the underlying model. -For this guide, load a Stable Diffusion XL (SDXL) checkpoint and the [KappaNeuro/studio-ghibli-style]() and [Norod78/sdxl-chalkboarddrawing-lora]() LoRAs with the [`~loaders.LoraLoaderMixin.load_lora_weights`] method. You'll need to assign each LoRA an `adapter_name` to combine them later. +For this guide, load a Stable Diffusion XL (SDXL) checkpoint and the [KappaNeuro/studio-ghibli-style]() and [Norod78/sdxl-chalkboarddrawing-lora]() LoRAs with the [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method. You'll need to assign each LoRA an `adapter_name` to combine them later. ```py from diffusers import DiffusionPipeline @@ -182,9 +182,9 @@ image ## fuse_lora -Both the [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] and [`~peft.LoraModel.add_weighted_adapter`] methods require loading the base model and the LoRA adapters separately which incurs some overhead. The [`~loaders.LoraLoaderMixin.fuse_lora`] method allows you to fuse the LoRA weights directly with the original weights of the underlying model. This way, you're only loading the model once which can increase inference and lower memory-usage. +Both the [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] and [`~peft.LoraModel.add_weighted_adapter`] methods require loading the base model and the LoRA adapters separately which incurs some overhead. The [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] method allows you to fuse the LoRA weights directly with the original weights of the underlying model. This way, you're only loading the model once which can increase inference and lower memory-usage. -You can use PEFT to easily fuse/unfuse multiple adapters directly into the model weights (both UNet and text encoder) using the [`~loaders.LoraLoaderMixin.fuse_lora`] method, which can lead to a speed-up in inference and lower VRAM usage. +You can use PEFT to easily fuse/unfuse multiple adapters directly into the model weights (both UNet and text encoder) using the [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] method, which can lead to a speed-up in inference and lower VRAM usage. For example, if you have a base model and adapters loaded and set as active with the following adapter weights: @@ -199,13 +199,13 @@ pipeline.load_lora_weights("lordjia/by-feng-zikai", weight_name="fengzikai_v1.0_ pipeline.set_adapters(["ikea", "feng"], adapter_weights=[0.7, 0.8]) ``` -Fuse these LoRAs into the UNet with the [`~loaders.LoraLoaderMixin.fuse_lora`] method. The `lora_scale` parameter controls how much to scale the output by with the LoRA weights. It is important to make the `lora_scale` adjustments in the [`~loaders.LoraLoaderMixin.fuse_lora`] method because it won’t work if you try to pass `scale` to the `cross_attention_kwargs` in the pipeline. +Fuse these LoRAs into the UNet with the [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] method. The `lora_scale` parameter controls how much to scale the output by with the LoRA weights. 
It is important to make the `lora_scale` adjustments in the [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] method because it won’t work if you try to pass `scale` to the `cross_attention_kwargs` in the pipeline. ```py pipeline.fuse_lora(adapter_names=["ikea", "feng"], lora_scale=1.0) ``` -Then you should use [`~loaders.LoraLoaderMixin.unload_lora_weights`] to unload the LoRA weights since they've already been fused with the underlying base model. Finally, call [`~DiffusionPipeline.save_pretrained`] to save the fused pipeline locally or you could call [`~DiffusionPipeline.push_to_hub`] to push the fused pipeline to the Hub. +Then you should use [`~loaders.StableDiffusionLoraLoaderMixin.unload_lora_weights`] to unload the LoRA weights since they've already been fused with the underlying base model. Finally, call [`~DiffusionPipeline.save_pretrained`] to save the fused pipeline locally or you could call [`~DiffusionPipeline.push_to_hub`] to push the fused pipeline to the Hub. ```py pipeline.unload_lora_weights() @@ -226,7 +226,7 @@ image = pipeline("A bowl of ramen shaped like a cute kawaii bear, by Feng Zikai" image ``` -You can call [`~loaders.LoraLoaderMixin.unfuse_lora`] to restore the original model's weights (for example, if you want to use a different `lora_scale` value). However, this only works if you've only fused one LoRA adapter to the original model. If you've fused multiple LoRAs, you'll need to reload the model. +You can call [`~loaders.StableDiffusionLoraLoaderMixin.unfuse_lora`] to restore the original model's weights (for example, if you want to use a different `lora_scale` value). However, this only works if you've only fused one LoRA adapter to the original model. If you've fused multiple LoRAs, you'll need to reload the model. ```py pipeline.unfuse_lora() diff --git a/docs/source/en/using-diffusers/other-formats.md b/docs/source/en/using-diffusers/other-formats.md index 6acd736b5f34..59ce3c5c80a4 100644 --- a/docs/source/en/using-diffusers/other-formats.md +++ b/docs/source/en/using-diffusers/other-formats.md @@ -74,7 +74,7 @@ pipeline = StableDiffusionPipeline.from_single_file( [LoRA](https://hf.co/docs/peft/conceptual_guides/adapter#low-rank-adaptation-lora) is a lightweight adapter that is fast and easy to train, making them especially popular for generating images in a certain way or style. These adapters are commonly stored in a safetensors file, and are widely popular on model sharing platforms like [civitai](https://civitai.com/). -LoRAs are loaded into a base model with the [`~loaders.LoraLoaderMixin.load_lora_weights`] method. +LoRAs are loaded into a base model with the [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method. ```py from diffusers import StableDiffusionXLPipeline diff --git a/docs/source/en/using-diffusers/pag.md b/docs/source/en/using-diffusers/pag.md index c9f950bf0b23..f6ca87ef0662 100644 --- a/docs/source/en/using-diffusers/pag.md +++ b/docs/source/en/using-diffusers/pag.md @@ -44,10 +44,10 @@ pipeline.enable_model_cpu_offload() > [!TIP] > The `pag_applied_layers` argument allows you to specify which layers PAG is applied to. Additionally, you can use `set_pag_applied_layers` method to update these layers after the pipeline has been created. Check out the [pag_applied_layers](#pag_applied_layers) section to learn more about applying PAG to other layers. -If you already have a pipeline created and loaded, you can enable PAG on it using the `from_pipe` API with the `enable_pag` flag. 
Internally, a PAG pipeline is created based on the pipeline and task you specified. In the example below, since we used `AutoPipelineForText2Image` and passed a `StableDiffusionXLPipeline`, a `StableDiffusionXLPAGPipeline` is created accordingly. Note that this does not require additional memory, and you will have both `StableDiffusionXLPipeline` and `StableDiffusionXLPAGPipeline` loaded and ready to use. You can read more about the `from_pipe` API and how to reuse pipelines in diffuser[here](https://huggingface.co/docs/diffusers/using-diffusers/loading#reuse-a-pipeline)
+If you already have a pipeline created and loaded, you can enable PAG on it using the `from_pipe` API with the `enable_pag` flag. Internally, a PAG pipeline is created based on the pipeline and task you specified. In the example below, since we used `AutoPipelineForText2Image` and passed a `StableDiffusionXLPipeline`, a `StableDiffusionXLPAGPipeline` is created accordingly. Note that this does not require additional memory, and you will have both `StableDiffusionXLPipeline` and `StableDiffusionXLPAGPipeline` loaded and ready to use. You can read more about the `from_pipe` API and how to reuse pipelines in Diffusers [here](https://huggingface.co/docs/diffusers/using-diffusers/loading#reuse-a-pipeline).

```py
-pipeline_sdxl = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0, torch_dtype=torch.float16")
+pipeline_sdxl = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16)
pipeline = AutoPipelineForText2Image.from_pipe(pipeline_sdxl, enable_pag=True)
```
diff --git a/docs/source/en/using-diffusers/shap-e.md b/docs/source/en/using-diffusers/shap-e.md
index 588dde97c98c..5c80ff88d247 100644
--- a/docs/source/en/using-diffusers/shap-e.md
+++ b/docs/source/en/using-diffusers/shap-e.md
@@ -52,7 +52,7 @@ images = pipe(
).images
```

-Now use the [`~utils.export_to_gif`] function to turn the list of image frames into a gif of the 3D object.
+Now use the [`~utils.export_to_gif`] function to turn the list of image frames into a GIF of the 3D object.

```py
from diffusers.utils import export_to_gif
diff --git a/docs/source/en/using-diffusers/svd.md b/docs/source/en/using-diffusers/svd.md
index cbb74d7b1026..7852d81fa209 100644
--- a/docs/source/en/using-diffusers/svd.md
+++ b/docs/source/en/using-diffusers/svd.md
@@ -21,6 +21,7 @@ This guide will show you how to use SVD to generate short videos from images.
Before you begin, make sure you have the following libraries installed: ```py +# Colab에서 필요한 라이브러리를 설치하기 위해 주석을 제외하세요 !pip install -q -U diffusers transformers accelerate ``` diff --git a/docs/source/ko/_toctree.yml b/docs/source/ko/_toctree.yml index 016e04158925..05504cbadfd0 100644 --- a/docs/source/ko/_toctree.yml +++ b/docs/source/ko/_toctree.yml @@ -1,121 +1,183 @@ - sections: - local: index - title: "🧨 Diffusers" + title: 🧨 Diffusers - local: quicktour title: "훑어보기" - local: stable_diffusion title: Stable Diffusion - local: installation - title: "설치" - title: "시작하기" + title: 설치 + title: 시작하기 - sections: - local: tutorials/tutorial_overview title: 개요 - local: using-diffusers/write_own_pipeline title: 모델과 스케줄러 이해하기 - - local: in_translation - title: AutoPipeline + - local: in_translation # tutorials/autopipeline + title: (번역중) AutoPipeline - local: tutorials/basic_training title: Diffusion 모델 학습하기 - title: Tutorials + - local: in_translation # tutorials/using_peft_for_inference + title: (번역중) 추론을 위한 LoRAs 불러오기 + - local: in_translation # tutorials/fast_diffusion + title: (번역중) Text-to-image diffusion 모델 추론 가속화하기 + - local: in_translation # tutorials/inference_with_big_models + title: (번역중) 큰 모델로 작업하기 + title: 튜토리얼 - sections: - - sections: - - local: using-diffusers/loading_overview - title: 개요 - - local: using-diffusers/loading - title: 파이프라인, 모델, 스케줄러 불러오기 - - local: using-diffusers/schedulers - title: 다른 스케줄러들을 가져오고 비교하기 - - local: using-diffusers/custom_pipeline_overview - title: 커뮤니티 파이프라인 불러오기 - - local: using-diffusers/using_safetensors - title: 세이프텐서 불러오기 - - local: using-diffusers/other-formats - title: 다른 형식의 Stable Diffusion 불러오기 - - local: in_translation - title: Hub에 파일 push하기 - title: 불러오기 & 허브 - - sections: - - local: using-diffusers/pipeline_overview - title: 개요 - - local: using-diffusers/unconditional_image_generation - title: Unconditional 이미지 생성 - - local: using-diffusers/conditional_image_generation - title: Text-to-image 생성 - - local: using-diffusers/img2img - title: Text-guided image-to-image - - local: using-diffusers/inpaint - title: Text-guided 이미지 인페인팅 - - local: using-diffusers/depth2img - title: Text-guided depth-to-image - - local: using-diffusers/textual_inversion_inference - title: Textual inversion - - local: training/distributed_inference - title: 여러 GPU를 사용한 분산 추론 - - local: in_translation - title: Distilled Stable Diffusion 추론 - - local: using-diffusers/reusing_seeds - title: Deterministic 생성으로 이미지 퀄리티 높이기 - - local: using-diffusers/control_brightness - title: 이미지 밝기 조정하기 - - local: using-diffusers/reproducibility - title: 재현 가능한 파이프라인 생성하기 - - local: using-diffusers/custom_pipeline_examples - title: 커뮤니티 파이프라인들 - - local: using-diffusers/contribute_pipeline - title: 커뮤티니 파이프라인에 기여하는 방법 - - local: using-diffusers/stable_diffusion_jax_how_to - title: JAX/Flax에서의 Stable Diffusion - - local: using-diffusers/weighted_prompts - title: Weighting Prompts - title: 추론을 위한 파이프라인 - - sections: - - local: training/overview - title: 개요 - - local: training/create_dataset - title: 학습을 위한 데이터셋 생성하기 - - local: training/adapt_a_model - title: 새로운 태스크에 모델 적용하기 + - local: using-diffusers/loading + title: 파이프라인 불러오기 + - local: using-diffusers/custom_pipeline_overview + title: 커뮤니티 파이프라인과 컴포넌트 불러오기 + - local: using-diffusers/schedulers + title: 스케줄러와 모델 불러오기 + - local: using-diffusers/other-formats + title: 모델 파일과 레이아웃 + - local: using-diffusers/loading_adapters + title: 어댑터 불러오기 + - local: using-diffusers/push_to_hub + title: 파일들을 Hub로 푸시하기 + title: 파이프라인과 어댑터 불러오기 +- 
sections: + - local: using-diffusers/unconditional_image_generation + title: Unconditional 이미지 생성 + - local: using-diffusers/conditional_image_generation + title: Text-to-image + - local: using-diffusers/img2img + title: Image-to-image + - local: using-diffusers/inpaint + title: 인페인팅 + - local: in_translation # using-diffusers/text-img2vid + title: (번역중) Text 또는 image-to-video + - local: using-diffusers/depth2img + title: Depth-to-image + title: 생성 태스크 +- sections: + - local: in_translation # using-diffusers/overview_techniques + title: (번역중) 개요 + - local: training/distributed_inference + title: 여러 GPU를 사용한 분산 추론 + - local: in_translation # using-diffusers/merge_loras + title: (번역중) LoRA 병합 + - local: in_translation # using-diffusers/scheduler_features + title: (번역중) 스케줄러 기능 + - local: in_translation # using-diffusers/callback + title: (번역중) 파이프라인 콜백 + - local: in_translation # using-diffusers/reusing_seeds + title: (번역중) 재현 가능한 파이프라인 + - local: in_translation # using-diffusers/image_quality + title: (번역중) 이미지 퀄리티 조절하기 + - local: using-diffusers/weighted_prompts + title: 프롬프트 기술 + title: 추론 테크닉 +- sections: + - local: in_translation # advanced_inference/outpaint + title: (번역중) Outpainting + title: 추론 심화 +- sections: + - local: in_translation # using-diffusers/sdxl + title: (번역중) Stable Diffusion XL + - local: using-diffusers/sdxl_turbo + title: SDXL Turbo + - local: using-diffusers/kandinsky + title: Kandinsky + - local: in_translation # using-diffusers/ip_adapter + title: (번역중) IP-Adapter + - local: in_translation # using-diffusers/pag + title: (번역중) PAG + - local: in_translation # using-diffusers/controlnet + title: (번역중) ControlNet + - local: in_translation # using-diffusers/t2i_adapter + title: (번역중) T2I-Adapter + - local: in_translation # using-diffusers/inference_with_lcm + title: (번역중) Latent Consistency Model + - local: using-diffusers/textual_inversion_inference + title: Textual inversion + - local: using-diffusers/shap-e + title: Shap-E + - local: using-diffusers/diffedit + title: DiffEdit + - local: in_translation # using-diffusers/inference_with_tcd_lora + title: (번역중) Trajectory Consistency Distillation-LoRA + - local: using-diffusers/svd + title: Stable Video Diffusion + - local: in_translation # using-diffusers/marigold_usage + title: (번역중) Marigold 컴퓨터 비전 + title: 특정 파이프라인 예시 +- sections: + - local: training/overview + title: 개요 + - local: training/create_dataset + title: 학습을 위한 데이터셋 생성하기 + - local: training/adapt_a_model + title: 새로운 태스크에 모델 적용하기 + - isExpanded: false + sections: - local: training/unconditional_training title: Unconditional 이미지 생성 - - local: training/text_inversion - title: Textual Inversion - - local: training/dreambooth - title: DreamBooth - local: training/text2image title: Text-to-image - - local: training/lora - title: Low-Rank Adaptation of Large Language Models (LoRA) + - local: in_translation # training/sdxl + title: (번역중) Stable Diffusion XL + - local: in_translation # training/kandinsky + title: (번역중) Kandinsky 2.2 + - local: in_translation # training/wuerstchen + title: (번역중) Wuerstchen - local: training/controlnet title: ControlNet + - local: in_translation # training/t2i_adapters + title: (번역중) T2I-Adapters - local: training/instructpix2pix - title: InstructPix2Pix 학습 + title: InstructPix2Pix + title: 모델 + - isExpanded: false + sections: + - local: training/text_inversion + title: Textual Inversion + - local: training/dreambooth + title: DreamBooth + - local: training/lora + title: LoRA - local: training/custom_diffusion title: Custom 
Diffusion - title: Training - title: Diffusers 사용하기 + - local: in_translation # training/lcm_distill + title: (번역중) Latent Consistency Distillation + - local: in_translation # training/ddpo + title: (번역중) DDPO 강화학습 훈련 + title: 메서드 + title: 학습 - sections: - - local: optimization/opt_overview - title: 개요 - local: optimization/fp16 - title: 메모리와 속도 + title: 추론 스피드업 + - local: in_translation # optimization/memory + title: (번역중) 메모리 사용량 줄이기 - local: optimization/torch2.0 - title: Torch2.0 지원 + title: PyTorch 2.0 - local: optimization/xformers title: xFormers - - local: optimization/onnx - title: ONNX - - local: optimization/open_vino - title: OpenVINO - - local: optimization/coreml - title: Core ML - - local: optimization/mps - title: MPS - - local: optimization/habana - title: Habana Gaudi - local: optimization/tome - title: Token Merging - title: 최적화/특수 하드웨어 + title: Token merging + - local: in_translation # optimization/deepcache + title: (번역중) DeepCache + - local: in_translation # optimization/tgate + title: (번역중) TGATE + - sections: + - local: using-diffusers/stable_diffusion_jax_how_to + title: JAX/Flax + - local: optimization/onnx + title: ONNX + - local: optimization/open_vino + title: OpenVINO + - local: optimization/coreml + title: Core ML + title: 최적화된 모델 형식 + - sections: + - local: optimization/mps + title: Metal Performance Shaders (MPS) + - local: optimization/habana + title: Habana Gaudi + title: 최적화된 하드웨어 + title: 추론 가속화와 메모리 줄이기 - sections: - local: conceptual/philosophy title: 철학 diff --git a/docs/source/ko/conceptual/ethical_guidelines.md b/docs/source/ko/conceptual/ethical_guidelines.md index d76fde03268c..5b78525fdbda 100644 --- a/docs/source/ko/conceptual/ethical_guidelines.md +++ b/docs/source/ko/conceptual/ethical_guidelines.md @@ -10,26 +10,27 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. --> -# 🧨 Diffusers의 윤리 지침 +# 🧨 Diffusers의 윤리 지침 [[-diffusers-ethical-guidelines]] -## 서문 +## 서문 [[preamble]] [Diffusers](https://huggingface.co/docs/diffusers/index)는 사전 훈련된 diffusion 모델을 제공하며 추론 및 훈련을 위한 모듈식 툴박스로 사용됩니다. 이 기술의 실제 적용과 사회에 미칠 수 있는 부정적인 영향을 고려하여 Diffusers 라이브러리의 개발, 사용자 기여 및 사용에 윤리 지침을 제공하는 것이 중요하다고 생각합니다. -이 기술을 사용하는 데 연관된 위험은 아직 조사 중이지만, 몇 가지 예를 들면: 예술가들에 대한 저작권 문제; 딥 페이크의 악용; 부적절한 맥락에서의 성적 콘텐츠 생성; 동의 없는 impersonation; 사회적인 편견으로 인해 억압되는 그룹들에 대한 해로운 영향입니다. +이이 기술을 사용함에 따른 위험은 여전히 검토 중이지만, 몇 가지 예를 들면: 예술가들에 대한 저작권 문제; 딥 페이크의 악용; 부적절한 맥락에서의 성적 콘텐츠 생성; 동의 없는 사칭; 소수자 집단의 억압을 영속화하는 유해한 사회적 편견 등이 있습니다. + 우리는 위험을 지속적으로 추적하고 커뮤니티의 응답과 소중한 피드백에 따라 다음 지침을 조정할 것입니다. -## 범위 +## 범위 [[scope]] Diffusers 커뮤니티는 프로젝트의 개발에 다음과 같은 윤리 지침을 적용하며, 특히 윤리적 문제와 관련된 민감한 주제에 대한 커뮤니티의 기여를 조정하는 데 도움을 줄 것입니다. -## 윤리 지침 +## 윤리 지침 [[ethical-guidelines]] -다음 윤리 지침은 일반적으로 적용되지만, 기술적 선택을 할 때 윤리적으로 민감한 문제를 다룰 때 주로 적용할 것입니다. 또한, 해당 기술의 최신 동향과 관련된 신규 위험에 따라 시간이 지남에 따라 이러한 윤리 원칙을 조정할 것을 약속합니다. +다음 윤리 지침은 일반적으로 적용되지만, 민감한 윤리적 문제와 관련하여 기술적 선택을 할 때 이를 우선적으로 적용할 것입니다. 나아가, 해당 기술의 최신 동향과 관련된 새로운 위험이 발생함에 따라 이러한 윤리 원칙을 조정할 것을 약속드립니다. - **투명성**: 우리는 PR을 관리하고, 사용자에게 우리의 선택을 설명하며, 기술적 의사결정을 내릴 때 투명성을 유지할 것을 약속합니다. @@ -44,7 +45,7 @@ Diffusers 커뮤니티는 프로젝트의 개발에 다음과 같은 윤리 지 - **책임**: 우리는 커뮤니티와 팀워크를 통해, 이 기술의 잠재적인 위험과 위험을 예측하고 완화하는 데 대한 공동 책임을 가지고 있습니다. -## 구현 사례: 안전 기능과 메커니즘 +## 구현 사례: 안전 기능과 메커니즘 [[examples-of-implementations-safety-features-and-mechanisms]] 팀은 diffusion 기술과 관련된 잠재적인 윤리 및 사회적 위험에 대처하기 위한 기술적 및 비기술적 도구를 제공하고자 하고 있습니다. 또한, 커뮤니티의 참여는 이러한 기능의 구현하고 우리와 함께 인식을 높이는 데 매우 중요합니다. 
diff --git a/docs/source/ko/index.md b/docs/source/ko/index.md index f406a54bb882..d2c791e7ef3b 100644 --- a/docs/source/ko/index.md +++ b/docs/source/ko/index.md @@ -46,52 +46,4 @@ specific language governing permissions and limitations under the License.

🤗 Diffusers 클래스 및 메서드의 작동 방식에 대한 기술 설명.

- - -## Supported pipelines - -| Pipeline | Paper/Repository | Tasks | -|---|---|:---:| -| [alt_diffusion](./api/pipelines/alt_diffusion) | [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) | Image-to-Image Text-Guided Generation | -| [audio_diffusion](./api/pipelines/audio_diffusion) | [Audio Diffusion](https://github.com/teticio/audio-diffusion.git) | Unconditional Audio Generation | -| [controlnet](./api/pipelines/stable_diffusion/controlnet) | [Adding Conditional Control to Text-to-Image Diffusion Models](https://arxiv.org/abs/2302.05543) | Image-to-Image Text-Guided Generation | -| [cycle_diffusion](./api/pipelines/cycle_diffusion) | [Unifying Diffusion Models' Latent Space, with Applications to CycleDiffusion and Guidance](https://arxiv.org/abs/2210.05559) | Image-to-Image Text-Guided Generation | -| [dance_diffusion](./api/pipelines/dance_diffusion) | [Dance Diffusion](https://github.com/williamberman/diffusers.git) | Unconditional Audio Generation | -| [ddpm](./api/pipelines/ddpm) | [Denoising Diffusion Probabilistic Models](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation | -| [ddim](./api/pipelines/ddim) | [Denoising Diffusion Implicit Models](https://arxiv.org/abs/2010.02502) | Unconditional Image Generation | -| [if](./if) | [**IF**](./api/pipelines/if) | Image Generation | -| [if_img2img](./if) | [**IF**](./api/pipelines/if) | Image-to-Image Generation | -| [if_inpainting](./if) | [**IF**](./api/pipelines/if) | Image-to-Image Generation | -| [latent_diffusion](./api/pipelines/latent_diffusion) | [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752)| Text-to-Image Generation | -| [latent_diffusion](./api/pipelines/latent_diffusion) | [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752)| Super Resolution Image-to-Image | -| [latent_diffusion_uncond](./api/pipelines/latent_diffusion_uncond) | [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) | Unconditional Image Generation | -| [paint_by_example](./api/pipelines/paint_by_example) | [Paint by Example: Exemplar-based Image Editing with Diffusion Models](https://arxiv.org/abs/2211.13227) | Image-Guided Image Inpainting | -| [pndm](./api/pipelines/pndm) | [Pseudo Numerical Methods for Diffusion Models on Manifolds](https://arxiv.org/abs/2202.09778) | Unconditional Image Generation | -| [score_sde_ve](./api/pipelines/score_sde_ve) | [Score-Based Generative Modeling through Stochastic Differential Equations](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation | -| [score_sde_vp](./api/pipelines/score_sde_vp) | [Score-Based Generative Modeling through Stochastic Differential Equations](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation | -| [semantic_stable_diffusion](./api/pipelines/semantic_stable_diffusion) | [Semantic Guidance](https://arxiv.org/abs/2301.12247) | Text-Guided Generation | -| [stable_diffusion_text2img](./api/pipelines/stable_diffusion/text2img) | [Stable Diffusion](https://stability.ai/blog/stable-diffusion-public-release) | Text-to-Image Generation | -| [stable_diffusion_img2img](./api/pipelines/stable_diffusion/img2img) | [Stable Diffusion](https://stability.ai/blog/stable-diffusion-public-release) | Image-to-Image Text-Guided Generation | -| [stable_diffusion_inpaint](./api/pipelines/stable_diffusion/inpaint) | [Stable 
Diffusion](https://stability.ai/blog/stable-diffusion-public-release) | Text-Guided Image Inpainting | -| [stable_diffusion_panorama](./api/pipelines/stable_diffusion/panorama) | [MultiDiffusion](https://multidiffusion.github.io/) | Text-to-Panorama Generation | -| [stable_diffusion_pix2pix](./api/pipelines/stable_diffusion/pix2pix) | [InstructPix2Pix: Learning to Follow Image Editing Instructions](https://arxiv.org/abs/2211.09800) | Text-Guided Image Editing| -| [stable_diffusion_pix2pix_zero](./api/pipelines/stable_diffusion/pix2pix_zero) | [Zero-shot Image-to-Image Translation](https://pix2pixzero.github.io/) | Text-Guided Image Editing | -| [stable_diffusion_attend_and_excite](./api/pipelines/stable_diffusion/attend_and_excite) | [Attend-and-Excite: Attention-Based Semantic Guidance for Text-to-Image Diffusion Models](https://arxiv.org/abs/2301.13826) | Text-to-Image Generation | -| [stable_diffusion_self_attention_guidance](./api/pipelines/stable_diffusion/self_attention_guidance) | [Improving Sample Quality of Diffusion Models Using Self-Attention Guidance](https://arxiv.org/abs/2210.00939) | Text-to-Image Generation Unconditional Image Generation | -| [stable_diffusion_image_variation](./stable_diffusion/image_variation) | [Stable Diffusion Image Variations](https://github.com/LambdaLabsML/lambda-diffusers#stable-diffusion-image-variations) | Image-to-Image Generation | -| [stable_diffusion_latent_upscale](./stable_diffusion/latent_upscale) | [Stable Diffusion Latent Upscaler](https://twitter.com/StabilityAI/status/1590531958815064065) | Text-Guided Super Resolution Image-to-Image | -| [stable_diffusion_model_editing](./api/pipelines/stable_diffusion/model_editing) | [Editing Implicit Assumptions in Text-to-Image Diffusion Models](https://time-diffusion.github.io/) | Text-to-Image Model Editing | -| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [Stable Diffusion 2](https://stability.ai/blog/stable-diffusion-v2-release) | Text-to-Image Generation | -| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [Stable Diffusion 2](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Image Inpainting | -| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [Depth-Conditional Stable Diffusion](https://github.com/Stability-AI/stablediffusion#depth-conditional-stable-diffusion) | Depth-to-Image Generation | -| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [Stable Diffusion 2](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Super Resolution Image-to-Image | -| [stable_diffusion_safe](./api/pipelines/stable_diffusion_safe) | [Safe Stable Diffusion](https://arxiv.org/abs/2211.05105) | Text-Guided Generation | -| [stable_unclip](./stable_unclip) | Stable unCLIP | Text-to-Image Generation | -| [stable_unclip](./stable_unclip) | Stable unCLIP | Image-to-Image Text-Guided Generation | -| [stochastic_karras_ve](./api/pipelines/stochastic_karras_ve) | [Elucidating the Design Space of Diffusion-Based Generative Models](https://arxiv.org/abs/2206.00364) | Unconditional Image Generation | -| [text_to_video_sd](./api/pipelines/text_to_video) | [Modelscope's Text-to-video-synthesis Model in Open Domain](https://modelscope.cn/models/damo/text-to-video-synthesis/summary) | Text-to-Video Generation | -| [unclip](./api/pipelines/unclip) | [Hierarchical Text-Conditional Image Generation with CLIP Latents](https://arxiv.org/abs/2204.06125)(implementation by [kakaobrain](https://github.com/kakaobrain/karlo)) | Text-to-Image Generation | -| 
[versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Text-to-Image Generation | -| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Image Variations Generation | -| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Dual Image and Text Guided Generation | -| [vq_diffusion](./api/pipelines/vq_diffusion) | [Vector Quantized Diffusion Model for Text-to-Image Synthesis](https://arxiv.org/abs/2111.14822) | Text-to-Image Generation | + \ No newline at end of file diff --git a/docs/source/ko/optimization/opt_overview.md b/docs/source/ko/optimization/opt_overview.md deleted file mode 100644 index 40b1cabeb6f3..000000000000 --- a/docs/source/ko/optimization/opt_overview.md +++ /dev/null @@ -1,17 +0,0 @@ - - -# 개요 - -노이즈가 많은 출력에서 적은 출력으로 만드는 과정으로 고품질 생성 모델의 출력을 만드는 각각의 반복되는 스텝은 많은 계산이 필요합니다. 🧨 Diffuser의 목표 중 하나는 모든 사람이 이 기술을 널리 이용할 수 있도록 하는 것이며, 여기에는 소비자 및 특수 하드웨어에서 빠른 추론을 가능하게 하는 것을 포함합니다. - -이 섹션에서는 추론 속도를 최적화하고 메모리 소비를 줄이기 위한 반정밀(half-precision) 가중치 및 sliced attention과 같은 팁과 요령을 다룹니다. 또한 [`torch.compile`](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) 또는 [ONNX Runtime](https://onnxruntime.ai/docs/)을 사용하여 PyTorch 코드의 속도를 높이고, [xFormers](https://facebookresearch.github.io/xformers/)를 사용하여 memory-efficient attention을 활성화하는 방법을 배울 수 있습니다. Apple Silicon, Intel 또는 Habana 프로세서와 같은 특정 하드웨어에서 추론을 실행하기 위한 가이드도 있습니다. \ No newline at end of file diff --git a/docs/source/ko/quicktour.md b/docs/source/ko/quicktour.md index 1b5d0443a515..e30c80dbf5b3 100644 --- a/docs/source/ko/quicktour.md +++ b/docs/source/ko/quicktour.md @@ -15,7 +15,7 @@ specific language governing permissions and limitations under the License. Diffusion 모델은 이미지나 오디오와 같은 관심 샘플들을 생성하기 위해 랜덤 가우시안 노이즈를 단계별로 제거하도록 학습됩니다. 이로 인해 생성 AI에 대한 관심이 매우 높아졌으며, 인터넷에서 diffusion 생성 이미지의 예를 본 적이 있을 것입니다. 🧨 Diffusers는 누구나 diffusion 모델들을 널리 이용할 수 있도록 하기 위한 라이브러리입니다. -개발자든 일반 사용자든 이 훑어보기를 통해 🧨 diffusers를 소개하고 빠르게 생성할 수 있도록 도와드립니다! 알아야 할 라이브러리의 주요 구성 요소는 크게 세 가지입니다: +개발자든 일반 사용자든 이 훑어보기를 통해 🧨 Diffusers를 소개하고 빠르게 생성할 수 있도록 도와드립니다! 알아야 할 라이브러리의 주요 구성 요소는 크게 세 가지입니다: * [`DiffusionPipeline`]은 추론을 위해 사전 학습된 diffusion 모델에서 샘플을 빠르게 생성하도록 설계된 높은 수준의 엔드투엔드 클래스입니다. * Diffusion 시스템 생성을 위한 빌딩 블록으로 사용할 수 있는 널리 사용되는 사전 학습된 [model](./api/models) 아키텍처 및 모듈. diff --git a/docs/source/ko/using-diffusers/contribute_pipeline.md b/docs/source/ko/using-diffusers/contribute_pipeline.md deleted file mode 100644 index 36d27e23e392..000000000000 --- a/docs/source/ko/using-diffusers/contribute_pipeline.md +++ /dev/null @@ -1,182 +0,0 @@ - - -# 커뮤니티 파이프라인에 기여하는 방법 - - - -💡 모든 사람이 속도 저하 없이 쉽게 작업을 공유할 수 있도록 커뮤니티 파이프라인을 추가하는 이유에 대한 자세한 내용은 GitHub 이슈 [#841](https://github.com/huggingface/diffusers/issues/841)를 참조하세요. - - - -커뮤니티 파이프라인을 사용하면 [`DiffusionPipeline`] 위에 원하는 추가 기능을 추가할 수 있습니다. `DiffusionPipeline` 위에 구축할 때의 가장 큰 장점은 누구나 인수를 하나만 추가하면 파이프라인을 로드하고 사용할 수 있어 커뮤니티가 매우 쉽게 접근할 수 있다는 것입니다. - -이번 가이드에서는 커뮤니티 파이프라인을 생성하는 방법과 작동 원리를 설명합니다. -간단하게 설명하기 위해 `UNet`이 단일 forward pass를 수행하고 스케줄러를 한 번 호출하는 "one-step" 파이프라인을 만들겠습니다. - -## 파이프라인 초기화 - -커뮤니티 파이프라인을 위한 `one_step_unet.py` 파일을 생성하는 것으로 시작합니다. 이 파일에서, Hub에서 모델 가중치와 스케줄러 구성을 로드할 수 있도록 [`DiffusionPipeline`]을 상속하는 파이프라인 클래스를 생성합니다. 
one-step 파이프라인에는 `UNet`과 스케줄러가 필요하므로 이를 `__init__` 함수에 인수로 추가해야합니다: - -```python -from diffusers import DiffusionPipeline -import torch - - -class UnetSchedulerOneForwardPipeline(DiffusionPipeline): - def __init__(self, unet, scheduler): - super().__init__() -``` - -파이프라인과 그 구성요소(`unet` and `scheduler`)를 [`~DiffusionPipeline.save_pretrained`]으로 저장할 수 있도록 하려면 `register_modules` 함수에 추가하세요: - -```diff - from diffusers import DiffusionPipeline - import torch - - class UnetSchedulerOneForwardPipeline(DiffusionPipeline): - def __init__(self, unet, scheduler): - super().__init__() - -+ self.register_modules(unet=unet, scheduler=scheduler) -``` - -이제 '초기화' 단계가 완료되었으니 forward pass로 이동할 수 있습니다! 🔥 - -## Forward pass 정의 - -Forward pass 에서는(`__call__`로 정의하는 것이 좋습니다) 원하는 기능을 추가할 수 있는 완전한 창작 자유가 있습니다. 우리의 놀라운 one-step 파이프라인의 경우, 임의의 이미지를 생성하고 `timestep=1`을 설정하여 `unet`과 `scheduler`를 한 번만 호출합니다: - -```diff - from diffusers import DiffusionPipeline - import torch - - - class UnetSchedulerOneForwardPipeline(DiffusionPipeline): - def __init__(self, unet, scheduler): - super().__init__() - - self.register_modules(unet=unet, scheduler=scheduler) - -+ def __call__(self): -+ image = torch.randn( -+ (1, self.unet.config.in_channels, self.unet.config.sample_size, self.unet.config.sample_size), -+ ) -+ timestep = 1 - -+ model_output = self.unet(image, timestep).sample -+ scheduler_output = self.scheduler.step(model_output, timestep, image).prev_sample - -+ return scheduler_output -``` - -끝났습니다! 🚀 이제 이 파이프라인에 `unet`과 `scheduler`를 전달하여 실행할 수 있습니다: - -```python -from diffusers import DDPMScheduler, UNet2DModel - -scheduler = DDPMScheduler() -unet = UNet2DModel() - -pipeline = UnetSchedulerOneForwardPipeline(unet=unet, scheduler=scheduler) - -output = pipeline() -``` - -하지만 파이프라인 구조가 동일한 경우 기존 가중치를 파이프라인에 로드할 수 있다는 장점이 있습니다. 예를 들어 one-step 파이프라인에 [`google/ddpm-cifar10-32`](https://huggingface.co/google/ddpm-cifar10-32) 가중치를 로드할 수 있습니다: - -```python -pipeline = UnetSchedulerOneForwardPipeline.from_pretrained("google/ddpm-cifar10-32") - -output = pipeline() -``` - -## 파이프라인 공유 - -🧨Diffusers [리포지토리](https://github.com/huggingface/diffusers)에서 Pull Request를 열어 [examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) 하위 폴더에 `one_step_unet.py`의 멋진 파이프라인을 추가하세요. - -병합이 되면, `diffusers >= 0.4.0`이 설치된 사용자라면 누구나 `custom_pipeline` 인수에 지정하여 이 파이프라인을 마술처럼 🪄 사용할 수 있습니다: - -```python -from diffusers import DiffusionPipeline - -pipe = DiffusionPipeline.from_pretrained("google/ddpm-cifar10-32", custom_pipeline="one_step_unet") -pipe() -``` - -커뮤니티 파이프라인을 공유하는 또 다른 방법은 Hub 에서 선호하는 [모델 리포지토리](https://huggingface.co/docs/hub/models-uploading)에 직접 `one_step_unet.py` 파일을 업로드하는 것입니다. `one_step_unet.py` 파일을 지정하는 대신 모델 저장소 id를 `custom_pipeline` 인수에 전달하세요: - -```python -from diffusers import DiffusionPipeline - -pipeline = DiffusionPipeline.from_pretrained("google/ddpm-cifar10-32", custom_pipeline="stevhliu/one_step_unet") -``` - -다음 표에서 두 가지 공유 워크플로우를 비교하여 자신에게 가장 적합한 옵션을 결정하는 데 도움이 되는 정보를 확인하세요: - -| | GitHub 커뮤니티 파이프라인 | HF Hub 커뮤니티 파이프라인 | -|----------------|------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------| -| 사용법 | 동일 | 동일 | -| 리뷰 과정 | 병합하기 전에 GitHub에서 Pull Request를 열고 Diffusers 팀의 검토 과정을 거칩니다. 속도가 느릴 수 있습니다. | 검토 없이 Hub 저장소에 바로 업로드합니다. 가장 빠른 워크플로우 입니다. | -| 가시성 | 공식 Diffusers 저장소 및 문서에 포함되어 있습니다. | HF 허브 프로필에 포함되며 가시성을 확보하기 위해 자신의 사용량/프로모션에 의존합니다. 
| - - - -💡 커뮤니티 파이프라인 파일에 원하는 패키지를 사용할 수 있습니다. 사용자가 패키지를 설치하기만 하면 모든 것이 정상적으로 작동합니다. 파이프라인이 자동으로 감지되므로 `DiffusionPipeline`에서 상속하는 파이프라인 클래스가 하나만 있는지 확인하세요. - - - -## 커뮤니티 파이프라인은 어떻게 작동하나요? - -커뮤니티 파이프라인은 [`DiffusionPipeline`]을 상속하는 클래스입니다: - -- [`custom_pipeline`] 인수로 로드할 수 있습니다. -- 모델 가중치 및 스케줄러 구성은 [`pretrained_model_name_or_path`]에서 로드됩니다. -- 커뮤니티 파이프라인에서 기능을 구현하는 코드는 `pipeline.py` 파일에 정의되어 있습니다. - -공식 저장소에서 모든 파이프라인 구성 요소 가중치를 로드할 수 없는 경우가 있습니다. 이 경우 다른 구성 요소는 파이프라인에 직접 전달해야 합니다: - -```python -from diffusers import DiffusionPipeline -from transformers import CLIPFeatureExtractor, CLIPModel - -model_id = "CompVis/stable-diffusion-v1-4" -clip_model_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K" - -feature_extractor = CLIPFeatureExtractor.from_pretrained(clip_model_id) -clip_model = CLIPModel.from_pretrained(clip_model_id, torch_dtype=torch.float16) - -pipeline = DiffusionPipeline.from_pretrained( - model_id, - custom_pipeline="clip_guided_stable_diffusion", - clip_model=clip_model, - feature_extractor=feature_extractor, - scheduler=scheduler, - torch_dtype=torch.float16, -) -``` - -커뮤니티 파이프라인의 마법은 다음 코드에 담겨 있습니다. 이 코드를 통해 커뮤니티 파이프라인을 GitHub 또는 Hub에서 로드할 수 있으며, 모든 🧨 Diffusers 패키지에서 사용할 수 있습니다. - -```python -# 2. 파이프라인 클래스를 로드합니다. 사용자 지정 모듈을 사용하는 경우 Hub에서 로드합니다 -# 명시적 클래스에서 로드하는 경우, 이를 사용해 보겠습니다. -if custom_pipeline is not None: - pipeline_class = get_class_from_dynamic_module( - custom_pipeline, module_file=CUSTOM_PIPELINE_FILE_NAME, cache_dir=custom_pipeline - ) -elif cls != DiffusionPipeline: - pipeline_class = cls -else: - diffusers_module = importlib.import_module(cls.__module__.split(".")[0]) - pipeline_class = getattr(diffusers_module, config_dict["_class_name"]) -``` diff --git a/docs/source/ko/using-diffusers/control_brightness.md b/docs/source/ko/using-diffusers/control_brightness.md deleted file mode 100644 index 522da736ec64..000000000000 --- a/docs/source/ko/using-diffusers/control_brightness.md +++ /dev/null @@ -1,45 +0,0 @@ -# 이미지 밝기 조절하기 - -Stable Diffusion 파이프라인은 [일반적인 디퓨전 노이즈 스케줄과 샘플 단계에 결함이 있음](https://huggingface.co/papers/2305.08891) 논문에서 설명한 것처럼 매우 밝거나 어두운 이미지를 생성하는 데는 성능이 평범합니다. 이 논문에서 제안한 솔루션은 현재 [`DDIMScheduler`]에 구현되어 있으며 이미지의 밝기를 개선하는 데 사용할 수 있습니다. - - - -💡 제안된 솔루션에 대한 자세한 내용은 위에 링크된 논문을 참고하세요! - - - -해결책 중 하나는 *v 예측값*과 *v 로스*로 모델을 훈련하는 것입니다. 다음 flag를 [`train_text_to_image.py`](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py) 또는 [`train_text_to_image_lora.py`](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora.py) 스크립트에 추가하여 `v_prediction`을 활성화합니다: - -```bash ---prediction_type="v_prediction" -``` - -예를 들어, `v_prediction`으로 미세 조정된 [`ptx0/pseudo-journey-v2`](https://huggingface.co/ptx0/pseudo-journey-v2) 체크포인트를 사용해 보겠습니다. - -다음으로 [`DDIMScheduler`]에서 다음 파라미터를 설정합니다: - -1. rescale_betas_zero_snr=True`, 노이즈 스케줄을 제로 터미널 신호 대 잡음비(SNR)로 재조정합니다. -2. `timestep_spacing="trailing"`, 마지막 타임스텝부터 샘플링 시작 - -```py ->>> from diffusers import DiffusionPipeline, DDIMScheduler - ->>> pipeline = DiffusionPipeline.from_pretrained("ptx0/pseudo-journey-v2") -# switch the scheduler in the pipeline to use the DDIMScheduler - ->>> pipeline.scheduler = DDIMScheduler.from_config( -... pipeline.scheduler.config, rescale_betas_zero_snr=True, timestep_spacing="trailing" -... 
) ->>> pipeline.to("cuda") -``` - -마지막으로 파이프라인에 대한 호출에서 `guidance_rescale`을 설정하여 과다 노출을 방지합니다: - -```py -prompt = "A lion in galaxies, spirals, nebulae, stars, smoke, iridescent, intricate detail, octane render, 8k" -image = pipeline(prompt, guidance_rescale=0.7).images[0] -``` - -
- -
\ No newline at end of file diff --git a/docs/source/ko/using-diffusers/custom_pipeline_examples.md b/docs/source/ko/using-diffusers/custom_pipeline_examples.md deleted file mode 100644 index 13060fb739f6..000000000000 --- a/docs/source/ko/using-diffusers/custom_pipeline_examples.md +++ /dev/null @@ -1,275 +0,0 @@ - - -# 커뮤니티 파이프라인 - -> **커뮤니티 파이프라인에 대한 자세한 내용은 [이 이슈](https://github.com/huggingface/diffusers/issues/841)를 참조하세요. - -**커뮤니티** 예제는 커뮤니티에서 추가한 추론 및 훈련 예제로 구성되어 있습니다. -다음 표를 참조하여 모든 커뮤니티 예제에 대한 개요를 확인하시기 바랍니다. **코드 예제**를 클릭하면 복사하여 붙여넣기할 수 있는 코드 예제를 확인할 수 있습니다. -커뮤니티가 예상대로 작동하지 않는 경우 이슈를 개설하고 작성자에게 핑을 보내주세요. - -| 예 | 설명 | 코드 예제 | 콜랩 |저자 | -|:---------------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------:| -| CLIP Guided Stable Diffusion | CLIP 가이드 기반의 Stable Diffusion으로 텍스트에서 이미지로 생성하기 | [CLIP Guided Stable Diffusion](#clip-guided-stable-diffusion) | [![콜랩에서 열기](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/CLIP_Guided_Stable_diffusion_with_diffusers.ipynb) | [Suraj Patil](https://github.com/patil-suraj/) | -| One Step U-Net (Dummy) | 커뮤니티 파이프라인을 어떻게 사용해야 하는지에 대한 예시(참고 https://github.com/huggingface/diffusers/issues/841) | [One Step U-Net](#one-step-unet) | - | [Patrick von Platen](https://github.com/patrickvonplaten/) | -| Stable Diffusion Interpolation | 서로 다른 프롬프트/시드 간 Stable Diffusion의 latent space 보간 | [Stable Diffusion Interpolation](#stable-diffusion-interpolation) | - | [Nate Raw](https://github.com/nateraw/) | -| Stable Diffusion Mega | 모든 기능을 갖춘 **하나의** Stable Diffusion 파이프라인 [Text2Image](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py), [Image2Image](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py) and [Inpainting](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py) | [Stable Diffusion Mega](#stable-diffusion-mega) | - | [Patrick von Platen](https://github.com/patrickvonplaten/) | -| Long Prompt Weighting Stable Diffusion | 토큰 길이 제한이 없고 프롬프트에서 파싱 가중치 지원을 하는 **하나의** Stable Diffusion 파이프라인, | [Long Prompt Weighting Stable Diffusion](#long-prompt-weighting-stable-diffusion) |- | [SkyTNT](https://github.com/SkyTNT) | -| Speech to Image | 자동 음성 인식을 사용하여 텍스트를 작성하고 Stable Diffusion을 사용하여 이미지를 생성합니다. | [Speech to Image](#speech-to-image) | - | [Mikail Duzenli](https://github.com/MikailINTech) | - -커스텀 파이프라인을 불러오려면 `diffusers/examples/community`에 있는 파일 중 하나로서 `custom_pipeline` 인수를 `DiffusionPipeline`에 전달하기만 하면 됩니다. 자신만의 파이프라인이 있는 PR을 보내주시면 빠르게 병합해드리겠습니다. 
-```py -pipe = DiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", custom_pipeline="filename_in_the_community_folder" -) -``` - -## 사용 예시 - -### CLIP 가이드 기반의 Stable Diffusion - -모든 노이즈 제거 단계에서 추가 CLIP 모델을 통해 Stable Diffusion을 가이드함으로써 CLIP 모델 기반의 Stable Diffusion은 보다 더 사실적인 이미지를 생성을 할 수 있습니다. - -다음 코드는 약 12GB의 GPU RAM이 필요합니다. - -```python -from diffusers import DiffusionPipeline -from transformers import CLIPImageProcessor, CLIPModel -import torch - - -feature_extractor = CLIPImageProcessor.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K") -clip_model = CLIPModel.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K", torch_dtype=torch.float16) - - -guided_pipeline = DiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", - custom_pipeline="clip_guided_stable_diffusion", - clip_model=clip_model, - feature_extractor=feature_extractor, - torch_dtype=torch.float16, -) -guided_pipeline.enable_attention_slicing() -guided_pipeline = guided_pipeline.to("cuda") - -prompt = "fantasy book cover, full moon, fantasy forest landscape, golden vector elements, fantasy magic, dark light night, intricate, elegant, sharp focus, illustration, highly detailed, digital painting, concept art, matte, art by WLOP and Artgerm and Albert Bierstadt, masterpiece" - -generator = torch.Generator(device="cuda").manual_seed(0) -images = [] -for i in range(4): - image = guided_pipeline( - prompt, - num_inference_steps=50, - guidance_scale=7.5, - clip_guidance_scale=100, - num_cutouts=4, - use_cutouts=False, - generator=generator, - ).images[0] - images.append(image) - -# 이미지 로컬에 저장하기 -for i, img in enumerate(images): - img.save(f"./clip_guided_sd/image_{i}.png") -``` - -이미지` 목록에는 로컬에 저장하거나 구글 콜랩에 직접 표시할 수 있는 PIL 이미지 목록이 포함되어 있습니다. 생성된 이미지는 기본적으로 안정적인 확산을 사용하는 것보다 품질이 높은 경향이 있습니다. 예를 들어 위의 스크립트는 다음과 같은 이미지를 생성합니다: - -![clip_guidance](https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/clip_guidance/merged_clip_guidance.jpg). - -### One Step Unet - -예시 "one-step-unet"는 다음과 같이 실행할 수 있습니다. - -```python -from diffusers import DiffusionPipeline - -pipe = DiffusionPipeline.from_pretrained("google/ddpm-cifar10-32", custom_pipeline="one_step_unet") -pipe() -``` - -**참고**: 이 커뮤니티 파이프라인은 기능으로 유용하지 않으며 커뮤니티 파이프라인을 추가할 수 있는 방법의 예시일 뿐입니다(https://github.com/huggingface/diffusers/issues/841 참조). - -### Stable Diffusion Interpolation - -다음 코드는 최소 8GB VRAM의 GPU에서 실행할 수 있으며 약 5분 정도 소요됩니다. - -```python -from diffusers import DiffusionPipeline -import torch - -pipe = DiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", - torch_dtype=torch.float16, - safety_checker=None, # Very important for videos...lots of false positives while interpolating - custom_pipeline="interpolate_stable_diffusion", -).to("cuda") -pipe.enable_attention_slicing() - -frame_filepaths = pipe.walk( - prompts=["a dog", "a cat", "a horse"], - seeds=[42, 1337, 1234], - num_interpolation_steps=16, - output_dir="./dreams", - batch_size=4, - height=512, - width=512, - guidance_scale=8.5, - num_inference_steps=50, -) -``` - -walk(...)` 함수의 출력은 `output_dir`에 정의된 대로 폴더에 저장된 이미지 목록을 반환합니다. 이 이미지를 사용하여 안정적으로 확산되는 동영상을 만들 수 있습니다. - -> 안정된 확산을 이용한 동영상 제작 방법과 더 많은 기능에 대한 자세한 내용은 https://github.com/nateraw/stable-diffusion-videos 에서 확인하시기 바랍니다. - -### Stable Diffusion Mega - -The Stable Diffusion Mega 파이프라인을 사용하면 Stable Diffusion 파이프라인의 주요 사용 사례를 단일 클래스에서 사용할 수 있습니다. 
-```python -#!/usr/bin/env python3 -from diffusers import DiffusionPipeline -import PIL -import requests -from io import BytesIO -import torch - - -def download_image(url): - response = requests.get(url) - return PIL.Image.open(BytesIO(response.content)).convert("RGB") - - -pipe = DiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", - custom_pipeline="stable_diffusion_mega", - torch_dtype=torch.float16, -) -pipe.to("cuda") -pipe.enable_attention_slicing() - - -### Text-to-Image - -images = pipe.text2img("An astronaut riding a horse").images - -### Image-to-Image - -init_image = download_image( - "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg" -) - -prompt = "A fantasy landscape, trending on artstation" - -images = pipe.img2img(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images - -### Inpainting - -img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png" -mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" -init_image = download_image(img_url).resize((512, 512)) -mask_image = download_image(mask_url).resize((512, 512)) - -prompt = "a cat sitting on a bench" -images = pipe.inpaint(prompt=prompt, image=init_image, mask_image=mask_image, strength=0.75).images -``` - -위에 표시된 것처럼 하나의 파이프라인에서 '텍스트-이미지 변환', '이미지-이미지 변환', '인페인팅'을 모두 실행할 수 있습니다. - -### Long Prompt Weighting Stable Diffusion - -파이프라인을 사용하면 77개의 토큰 길이 제한 없이 프롬프트를 입력할 수 있습니다. 또한 "()"를 사용하여 단어 가중치를 높이거나 "[]"를 사용하여 단어 가중치를 낮출 수 있습니다. -또한 파이프라인을 사용하면 단일 클래스에서 Stable Diffusion 파이프라인의 주요 사용 사례를 사용할 수 있습니다. - -#### pytorch - -```python -from diffusers import DiffusionPipeline -import torch - -pipe = DiffusionPipeline.from_pretrained( - "hakurei/waifu-diffusion", custom_pipeline="lpw_stable_diffusion", torch_dtype=torch.float16 -) -pipe = pipe.to("cuda") - -prompt = "best_quality (1girl:1.3) bow bride brown_hair closed_mouth frilled_bow frilled_hair_tubes frills (full_body:1.3) fox_ear hair_bow hair_tubes happy hood japanese_clothes kimono long_sleeves red_bow smile solo tabi uchikake white_kimono wide_sleeves cherry_blossoms" -neg_prompt = "lowres, bad_anatomy, error_body, error_hair, error_arm, error_hands, bad_hands, error_fingers, bad_fingers, missing_fingers, error_legs, bad_legs, multiple_legs, missing_legs, error_lighting, error_shadow, error_reflection, text, error, extra_digit, fewer_digits, cropped, worst_quality, low_quality, normal_quality, jpeg_artifacts, signature, watermark, username, blurry" - -pipe.text2img(prompt, negative_prompt=neg_prompt, width=512, height=512, max_embeddings_multiples=3).images[0] -``` - -#### onnxruntime - -```python -from diffusers import DiffusionPipeline -import torch - -pipe = DiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", - custom_pipeline="lpw_stable_diffusion_onnx", - revision="onnx", - provider="CUDAExecutionProvider", -) - -prompt = "a photo of an astronaut riding a horse on mars, best quality" -neg_prompt = "lowres, bad anatomy, error body, error hair, error arm, error hands, bad hands, error fingers, bad fingers, missing fingers, error legs, bad legs, multiple legs, missing legs, error lighting, error shadow, error reflection, text, error, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry" - 
-pipe.text2img(prompt, negative_prompt=neg_prompt, width=512, height=512, max_embeddings_multiples=3).images[0] -``` - -토큰 인덱스 시퀀스 길이가 이 모델에 지정된 최대 시퀀스 길이보다 길면(*** > 77). 이 시퀀스를 모델에서 실행하면 인덱싱 오류가 발생합니다`. 정상적인 현상이니 걱정하지 마세요. -### Speech to Image - -다음 코드는 사전학습된 OpenAI whisper-small과 Stable Diffusion을 사용하여 오디오 샘플에서 이미지를 생성할 수 있습니다. -```Python -import torch - -import matplotlib.pyplot as plt -from datasets import load_dataset -from diffusers import DiffusionPipeline -from transformers import ( - WhisperForConditionalGeneration, - WhisperProcessor, -) - - -device = "cuda" if torch.cuda.is_available() else "cpu" - -ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - -audio_sample = ds[3] - -text = audio_sample["text"].lower() -speech_data = audio_sample["audio"]["array"] - -model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device) -processor = WhisperProcessor.from_pretrained("openai/whisper-small") - -diffuser_pipeline = DiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", - custom_pipeline="speech_to_image_diffusion", - speech_model=model, - speech_processor=processor, - - torch_dtype=torch.float16, -) - -diffuser_pipeline.enable_attention_slicing() -diffuser_pipeline = diffuser_pipeline.to(device) - -output = diffuser_pipeline(speech_data) -plt.imshow(output.images[0]) -``` -위 예시는 다음의 결과 이미지를 보입니다. - -![image](https://user-images.githubusercontent.com/45072645/196901736-77d9c6fc-63ee-4072-90b0-dc8b903d63e3.png) \ No newline at end of file diff --git a/docs/source/ko/using-diffusers/diffedit.md b/docs/source/ko/using-diffusers/diffedit.md new file mode 100644 index 000000000000..74b9e9783155 --- /dev/null +++ b/docs/source/ko/using-diffusers/diffedit.md @@ -0,0 +1,285 @@ + + +# DiffEdit + +[[open-in-colab]] + +이미지 편집을 하려면 일반적으로 편집할 영역의 마스크를 제공해야 합니다. DiffEdit는 텍스트 쿼리를 기반으로 마스크를 자동으로 생성하므로 이미지 편집 소프트웨어 없이도 마스크를 만들기가 전반적으로 더 쉬워집니다. DiffEdit 알고리즘은 세 단계로 작동합니다: + +1. Diffusion 모델이 일부 쿼리 텍스트와 참조 텍스트를 조건부로 이미지의 노이즈를 제거하여 이미지의 여러 영역에 대해 서로 다른 노이즈 추정치를 생성하고, 그 차이를 사용하여 쿼리 텍스트와 일치하도록 이미지의 어느 영역을 변경해야 하는지 식별하기 위한 마스크를 추론합니다. +2. 입력 이미지가 DDIM을 사용하여 잠재 공간으로 인코딩됩니다. +3. 마스크 외부의 픽셀이 입력 이미지와 동일하게 유지되도록 마스크를 가이드로 사용하여 텍스트 쿼리에 조건이 지정된 diffusion 모델로 latents를 디코딩합니다. + +이 가이드에서는 마스크를 수동으로 만들지 않고 DiffEdit를 사용하여 이미지를 편집하는 방법을 설명합니다. + +시작하기 전에 다음 라이브러리가 설치되어 있는지 확인하세요: + +```py +# Colab에서 필요한 라이브러리를 설치하기 위해 주석을 제외하세요 +#!pip install -q diffusers transformers accelerate +``` + +[`StableDiffusionDiffEditPipeline`]에는 이미지 마스크와 부분적으로 반전된 latents 집합이 필요합니다. 이미지 마스크는 [`~StableDiffusionDiffEditPipeline.generate_mask`] 함수에서 생성되며, 두 개의 파라미터인 `source_prompt`와 `target_prompt`가 포함됩니다. 이 매개변수는 이미지에서 무엇을 편집할지 결정합니다. 예를 들어, *과일* 한 그릇을 *배* 한 그릇으로 변경하려면 다음과 같이 하세요: + +```py +source_prompt = "a bowl of fruits" +target_prompt = "a bowl of pears" +``` + +부분적으로 반전된 latents는 [`~StableDiffusionDiffEditPipeline.invert`] 함수에서 생성되며, 일반적으로 이미지를 설명하는 `prompt` 또는 *캡션*을 포함하는 것이 inverse latent sampling 프로세스를 가이드하는 데 도움이 됩니다. 캡션은 종종 `source_prompt`가 될 수 있지만, 다른 텍스트 설명으로 자유롭게 실험해 보세요! 
+ +파이프라인, 스케줄러, 역 스케줄러를 불러오고 메모리 사용량을 줄이기 위해 몇 가지 최적화를 활성화해 보겠습니다: + +```py +import torch +from diffusers import DDIMScheduler, DDIMInverseScheduler, StableDiffusionDiffEditPipeline + +pipeline = StableDiffusionDiffEditPipeline.from_pretrained( + "stabilityai/stable-diffusion-2-1", + torch_dtype=torch.float16, + safety_checker=None, + use_safetensors=True, +) +pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config) +pipeline.inverse_scheduler = DDIMInverseScheduler.from_config(pipeline.scheduler.config) +pipeline.enable_model_cpu_offload() +pipeline.enable_vae_slicing() +``` + +수정하기 위한 이미지를 불러옵니다: + +```py +from diffusers.utils import load_image, make_image_grid + +img_url = "https://github.com/Xiang-cd/DiffEdit-stable-diffusion/raw/main/assets/origin.png" +raw_image = load_image(img_url).resize((768, 768)) +raw_image +``` + +이미지 마스크를 생성하기 위해 [`~StableDiffusionDiffEditPipeline.generate_mask`] 함수를 사용합니다. 이미지에서 편집할 내용을 지정하기 위해 `source_prompt`와 `target_prompt`를 전달해야 합니다: + +```py +from PIL import Image + +source_prompt = "a bowl of fruits" +target_prompt = "a basket of pears" +mask_image = pipeline.generate_mask( + image=raw_image, + source_prompt=source_prompt, + target_prompt=target_prompt, +) +Image.fromarray((mask_image.squeeze()*255).astype("uint8"), "L").resize((768, 768)) +``` + +다음으로, 반전된 latents를 생성하고 이미지를 묘사하는 캡션에 전달합니다: + +```py +inv_latents = pipeline.invert(prompt=source_prompt, image=raw_image).latents +``` + +마지막으로, 이미지 마스크와 반전된 latents를 파이프라인에 전달합니다. `target_prompt`는 이제 `prompt`가 되며, `source_prompt`는 `negative_prompt`로 사용됩니다. + +```py +output_image = pipeline( + prompt=target_prompt, + mask_image=mask_image, + image_latents=inv_latents, + negative_prompt=source_prompt, +).images[0] +mask_image = Image.fromarray((mask_image.squeeze()*255).astype("uint8"), "L").resize((768, 768)) +make_image_grid([raw_image, mask_image, output_image], rows=1, cols=3) +``` + +
+[figure: original image | edited image]
+ +## Source와 target 임베딩 생성하기 + +Source와 target 임베딩은 수동으로 생성하는 대신 [Flan-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5) 모델을 사용하여 자동으로 생성할 수 있습니다. + +Flan-T5 모델과 토크나이저를 🤗 Transformers 라이브러리에서 불러옵니다: + +```py +import torch +from transformers import AutoTokenizer, T5ForConditionalGeneration + +tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large") +model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large", device_map="auto", torch_dtype=torch.float16) +``` + +모델에 프롬프트할 source와 target 프롬프트를 생성하기 위해 초기 텍스트들을 제공합니다. + +```py +source_concept = "bowl" +target_concept = "basket" + +source_text = f"Provide a caption for images containing a {source_concept}. " +"The captions should be in English and should be no longer than 150 characters." + +target_text = f"Provide a caption for images containing a {target_concept}. " +"The captions should be in English and should be no longer than 150 characters." +``` + +다음으로, 프롬프트들을 생성하기 위해 유틸리티 함수를 생성합니다. + +```py +@torch.no_grad() +def generate_prompts(input_prompt): + input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids.to("cuda") + + outputs = model.generate( + input_ids, temperature=0.8, num_return_sequences=16, do_sample=True, max_new_tokens=128, top_k=10 + ) + return tokenizer.batch_decode(outputs, skip_special_tokens=True) + +source_prompts = generate_prompts(source_text) +target_prompts = generate_prompts(target_text) +print(source_prompts) +print(target_prompts) +``` + + + +다양한 품질의 텍스트를 생성하는 전략에 대해 자세히 알아보려면 [생성 전략](https://huggingface.co/docs/transformers/main/en/generation_strategies) 가이드를 참조하세요. + + + +텍스트 인코딩을 위해 [`StableDiffusionDiffEditPipeline`]에서 사용하는 텍스트 인코더 모델을 불러옵니다. 텍스트 인코더를 사용하여 텍스트 임베딩을 계산합니다: + +```py +import torch +from diffusers import StableDiffusionDiffEditPipeline + +pipeline = StableDiffusionDiffEditPipeline.from_pretrained( + "stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16, use_safetensors=True +) +pipeline.enable_model_cpu_offload() +pipeline.enable_vae_slicing() + +@torch.no_grad() +def embed_prompts(sentences, tokenizer, text_encoder, device="cuda"): + embeddings = [] + for sent in sentences: + text_inputs = tokenizer( + sent, + padding="max_length", + max_length=tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + prompt_embeds = text_encoder(text_input_ids.to(device), attention_mask=None)[0] + embeddings.append(prompt_embeds) + return torch.concatenate(embeddings, dim=0).mean(dim=0).unsqueeze(0) + +source_embeds = embed_prompts(source_prompts, pipeline.tokenizer, pipeline.text_encoder) +target_embeds = embed_prompts(target_prompts, pipeline.tokenizer, pipeline.text_encoder) +``` + +마지막으로, 임베딩을 [`~StableDiffusionDiffEditPipeline.generate_mask`] 및 [`~StableDiffusionDiffEditPipeline.invert`] 함수와 파이프라인에 전달하여 이미지를 생성합니다: + +```diff + from diffusers import DDIMInverseScheduler, DDIMScheduler + from diffusers.utils import load_image, make_image_grid + from PIL import Image + + pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config) + pipeline.inverse_scheduler = DDIMInverseScheduler.from_config(pipeline.scheduler.config) + + img_url = "https://github.com/Xiang-cd/DiffEdit-stable-diffusion/raw/main/assets/origin.png" + raw_image = load_image(img_url).resize((768, 768)) + + mask_image = pipeline.generate_mask( + image=raw_image, +- source_prompt=source_prompt, +- target_prompt=target_prompt, ++ source_prompt_embeds=source_embeds, ++ target_prompt_embeds=target_embeds, + ) + + 
inv_latents = pipeline.invert( +- prompt=source_prompt, ++ prompt_embeds=source_embeds, + image=raw_image, + ).latents + + output_image = pipeline( + mask_image=mask_image, + image_latents=inv_latents, +- prompt=target_prompt, +- negative_prompt=source_prompt, ++ prompt_embeds=target_embeds, ++ negative_prompt_embeds=source_embeds, + ).images[0] + mask_image = Image.fromarray((mask_image.squeeze()*255).astype("uint8"), "L") + make_image_grid([raw_image, mask_image, output_image], rows=1, cols=3) +``` + +## 반전을 위한 캡션 생성하기 + +`source_prompt`를 캡션으로 사용하여 부분적으로 반전된 latents를 생성할 수 있지만, [BLIP](https://huggingface.co/docs/transformers/model_doc/blip) 모델을 사용하여 캡션을 자동으로 생성할 수도 있습니다. + +🤗 Transformers 라이브러리에서 BLIP 모델과 프로세서를 불러옵니다: + +```py +import torch +from transformers import BlipForConditionalGeneration, BlipProcessor + +processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base") +model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base", torch_dtype=torch.float16, low_cpu_mem_usage=True) +``` + +입력 이미지에서 캡션을 생성하는 유틸리티 함수를 만듭니다: + +```py +@torch.no_grad() +def generate_caption(images, caption_generator, caption_processor): + text = "a photograph of" + + inputs = caption_processor(images, text, return_tensors="pt").to(device="cuda", dtype=caption_generator.dtype) + caption_generator.to("cuda") + outputs = caption_generator.generate(**inputs, max_new_tokens=128) + + # 캡션 generator 오프로드 + caption_generator.to("cpu") + + caption = caption_processor.batch_decode(outputs, skip_special_tokens=True)[0] + return caption +``` + +입력 이미지를 불러오고 `generate_caption` 함수를 사용하여 해당 이미지에 대한 캡션을 생성합니다: + +```py +from diffusers.utils import load_image + +img_url = "https://github.com/Xiang-cd/DiffEdit-stable-diffusion/raw/main/assets/origin.png" +raw_image = load_image(img_url).resize((768, 768)) +caption = generate_caption(raw_image, model, processor) +``` + +
+[figure: generated caption: "a photograph of a bowl of fruit on a table"]
+ +이제 캡션을 [`~StableDiffusionDiffEditPipeline.invert`] 함수에 놓아 부분적으로 반전된 latents를 생성할 수 있습니다! diff --git a/docs/source/ko/using-diffusers/kandinsky.md b/docs/source/ko/using-diffusers/kandinsky.md new file mode 100644 index 000000000000..cc554c67f989 --- /dev/null +++ b/docs/source/ko/using-diffusers/kandinsky.md @@ -0,0 +1,768 @@ + + +# Kandinsky + +[[open-in-colab]] + +Kandinsky 모델은 일련의 다국어 text-to-image 생성 모델입니다. Kandinsky 2.0 모델은 두 개의 다국어 텍스트 인코더를 사용하고 그 결과를 연결해 UNet에 사용됩니다. + +[Kandinsky 2.1](../api/pipelines/kandinsky)은 텍스트와 이미지 임베딩 간의 매핑을 생성하는 image prior 모델([`CLIP`](https://huggingface.co/docs/transformers/model_doc/clip))을 포함하도록 아키텍처를 변경했습니다. 이 매핑은 더 나은 text-image alignment를 제공하며, 학습 중에 텍스트 임베딩과 함께 사용되어 더 높은 품질의 결과를 가져옵니다. 마지막으로, Kandinsky 2.1은 spatial conditional 정규화 레이어를 추가하여 사실감을 높여주는 [Modulating Quantized Vectors (MoVQ)](https://huggingface.co/papers/2209.09002) 디코더를 사용하여 latents를 이미지로 디코딩합니다. + +[Kandinsky 2.2](../api/pipelines/kandinsky_v22)는 image prior 모델의 이미지 인코더를 더 큰 CLIP-ViT-G 모델로 교체하여 품질을 개선함으로써 이전 모델을 개선했습니다. 또한 image prior 모델은 해상도와 종횡비가 다른 이미지로 재훈련되어 더 높은 해상도의 이미지와 다양한 이미지 크기를 생성합니다. + +[Kandinsky 3](../api/pipelines/kandinsky3)는 아키텍처를 단순화하고 prior 모델과 diffusion 모델을 포함하는 2단계 생성 프로세스에서 벗어나고 있습니다. 대신, Kandinsky 3는 [Flan-UL2](https://huggingface.co/google/flan-ul2)를 사용하여 텍스트를 인코딩하고, [BigGan-deep](https://hf.co/papers/1809.11096) 블록이 포함된 UNet을 사용하며, [Sber-MoVQGAN](https://github.com/ai-forever/MoVQGAN)을 사용하여 latents를 이미지로 디코딩합니다. 텍스트 이해와 생성된 이미지 품질은 주로 더 큰 텍스트 인코더와 UNet을 사용함으로써 달성됩니다. + +이 가이드에서는 text-to-image, image-to-image, 인페인팅, 보간 등을 위해 Kandinsky 모델을 사용하는 방법을 설명합니다. + +시작하기 전에 다음 라이브러리가 설치되어 있는지 확인하세요: + +```py +# Colab에서 필요한 라이브러리를 설치하기 위해 주석을 제외하세요 +#!pip install -q diffusers transformers accelerate +``` + + + +Kandinsky 2.1과 2.2의 사용법은 매우 유사합니다! 유일한 차이점은 Kandinsky 2.2는 latents를 디코딩할 때 `프롬프트`를 입력으로 받지 않는다는 것입니다. 대신, Kandinsky 2.2는 디코딩 중에는 `image_embeds`만 받아들입니다. + +
+ +Kandinsky 3는 더 간결한 아키텍처를 가지고 있으며 prior 모델이 필요하지 않습니다. 즉, [Stable Diffusion XL](sdxl)과 같은 다른 diffusion 모델과 사용법이 동일합니다. + +
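+
+위에서 언급한 Kandinsky 2.1과 2.2의 호출 방식 차이를 간단한 스케치로 비교하면 다음과 같습니다. 아래 코드는 이해를 돕기 위한 예시일 뿐이며, 프롬프트는 임의로 가정한 값입니다. 각 버전의 자세한 사용법은 아래 섹션에서 단계별로 다룹니다:
+
```py
import torch
from diffusers import (
    KandinskyPriorPipeline,
    KandinskyPipeline,
    KandinskyV22PriorPipeline,
    KandinskyV22Pipeline,
)

prompt = "A starry night over a calm lake"  # 임의로 가정한 예시 프롬프트

# Kandinsky 2.1: 디코더 파이프라인이 prompt와 image_embeds를 모두 입력으로 받습니다
prior_21 = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16).to("cuda")
decoder_21 = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16).to("cuda")
image_embeds, negative_image_embeds = prior_21(prompt).to_tuple()
image_21 = decoder_21(prompt, image_embeds=image_embeds, negative_image_embeds=negative_image_embeds).images[0]

# Kandinsky 2.2: 디코더 파이프라인이 prompt 없이 image_embeds만 입력으로 받습니다
prior_22 = KandinskyV22PriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16).to("cuda")
decoder_22 = KandinskyV22Pipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16).to("cuda")
image_embeds, negative_image_embeds = prior_22(prompt).to_tuple()
image_22 = decoder_22(image_embeds=image_embeds, negative_image_embeds=negative_image_embeds).images[0]
```
+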
+ +## Text-to-image + +모든 작업에 Kandinsky 모델을 사용하려면 항상 프롬프트를 인코딩하고 이미지 임베딩을 생성하는 prior 파이프라인을 설정하는 것부터 시작해야 합니다. 이전 파이프라인은 negative 프롬프트 `""`에 해당하는 `negative_image_embeds`도 생성합니다. 더 나은 결과를 얻으려면 이전 파이프라인에 실제 `negative_prompt`를 전달할 수 있지만, 이렇게 하면 prior 파이프라인의 유효 배치 크기가 2배로 증가합니다. + + + + +```py +from diffusers import KandinskyPriorPipeline, KandinskyPipeline +import torch + +prior_pipeline = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16).to("cuda") +pipeline = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16).to("cuda") + +prompt = "A alien cheeseburger creature eating itself, claymation, cinematic, moody lighting" +negative_prompt = "low quality, bad quality" # negative 프롬프트 포함은 선택적이지만, 보통 결과는 더 좋습니다 +image_embeds, negative_image_embeds = prior_pipeline(prompt, negative_prompt, guidance_scale=1.0).to_tuple() +``` + +이제 모든 프롬프트와 임베딩을 [`KandinskyPipeline`]에 전달하여 이미지를 생성합니다: + +```py +image = pipeline(prompt, image_embeds=image_embeds, negative_prompt=negative_prompt, negative_image_embeds=negative_image_embeds, height=768, width=768).images[0] +image +``` + +
+ +
+ +
+ + +```py +from diffusers import KandinskyV22PriorPipeline, KandinskyV22Pipeline +import torch + +prior_pipeline = KandinskyV22PriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16).to("cuda") +pipeline = KandinskyV22Pipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16).to("cuda") + +prompt = "A alien cheeseburger creature eating itself, claymation, cinematic, moody lighting" +negative_prompt = "low quality, bad quality" # negative 프롬프트 포함은 선택적이지만, 보통 결과는 더 좋습니다 +image_embeds, negative_image_embeds = prior_pipeline(prompt, guidance_scale=1.0).to_tuple() +``` + +이미지 생성을 위해 `image_embeds`와 `negative_image_embeds`를 [`KandinskyV22Pipeline`]에 전달합니다: + +```py +image = pipeline(image_embeds=image_embeds, negative_image_embeds=negative_image_embeds, height=768, width=768).images[0] +image +``` + +
+ +
+ +
+ + +Kandinsky 3는 prior 모델이 필요하지 않으므로 [`Kandinsky3Pipeline`]을 직접 불러오고 이미지 생성 프롬프트를 전달할 수 있습니다: + +```py +from diffusers import Kandinsky3Pipeline +import torch + +pipeline = Kandinsky3Pipeline.from_pretrained("kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16) +pipeline.enable_model_cpu_offload() + +prompt = "A alien cheeseburger creature eating itself, claymation, cinematic, moody lighting" +image = pipeline(prompt).images[0] +image +``` + + +
+ +🤗 Diffusers는 또한 [`KandinskyCombinedPipeline`] 및 [`KandinskyV22CombinedPipeline`]이 포함된 end-to-end API를 제공하므로 prior 파이프라인과 text-to-image 변환 파이프라인을 별도로 불러올 필요가 없습니다. 결합된 파이프라인은 prior 모델과 디코더를 모두 자동으로 불러옵니다. 원하는 경우 `prior_guidance_scale` 및 `prior_num_inference_steps` 매개 변수를 사용하여 prior 파이프라인에 대해 다른 값을 설정할 수 있습니다. + +내부에서 결합된 파이프라인을 자동으로 호출하려면 [`AutoPipelineForText2Image`]를 사용합니다: + + + + +```py +from diffusers import AutoPipelineForText2Image +import torch + +pipeline = AutoPipelineForText2Image.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16) +pipeline.enable_model_cpu_offload() + +prompt = "A alien cheeseburger creature eating itself, claymation, cinematic, moody lighting" +negative_prompt = "low quality, bad quality" + +image = pipeline(prompt=prompt, negative_prompt=negative_prompt, prior_guidance_scale=1.0, guidance_scale=4.0, height=768, width=768).images[0] +image +``` + + + + +```py +from diffusers import AutoPipelineForText2Image +import torch + +pipeline = AutoPipelineForText2Image.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16) +pipeline.enable_model_cpu_offload() + +prompt = "A alien cheeseburger creature eating itself, claymation, cinematic, moody lighting" +negative_prompt = "low quality, bad quality" + +image = pipeline(prompt=prompt, negative_prompt=negative_prompt, prior_guidance_scale=1.0, guidance_scale=4.0, height=768, width=768).images[0] +image +``` + + + + +## Image-to-image + +Image-to-image 경우, 초기 이미지와 텍스트 프롬프트를 전달하여 파이프라인에 이미지를 conditioning합니다. Prior 파이프라인을 불러오는 것으로 시작합니다: + + + + +```py +import torch +from diffusers import KandinskyImg2ImgPipeline, KandinskyPriorPipeline + +prior_pipeline = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16, use_safetensors=True).to("cuda") +pipeline = KandinskyImg2ImgPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16, use_safetensors=True).to("cuda") +``` + + + + +```py +import torch +from diffusers import KandinskyV22Img2ImgPipeline, KandinskyPriorPipeline + +prior_pipeline = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16, use_safetensors=True).to("cuda") +pipeline = KandinskyV22Img2ImgPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16, use_safetensors=True).to("cuda") +``` + + + + +Kandinsky 3는 prior 모델이 필요하지 않으므로 image-to-image 파이프라인을 직접 불러올 수 있습니다: + +```py +from diffusers import Kandinsky3Img2ImgPipeline +from diffusers.utils import load_image +import torch + +pipeline = Kandinsky3Img2ImgPipeline.from_pretrained("kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16) +pipeline.enable_model_cpu_offload() +``` + + + + +Conditioning할 이미지를 다운로드합니다: + +```py +from diffusers.utils import load_image + +# 이미지 다운로드 +url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg" +original_image = load_image(url) +original_image = original_image.resize((768, 512)) +``` + +
+ +
+ +Prior 파이프라인으로 `image_embeds`와 `negative_image_embeds`를 생성합니다: + +```py +prompt = "A fantasy landscape, Cinematic lighting" +negative_prompt = "low quality, bad quality" + +image_embeds, negative_image_embeds = prior_pipeline(prompt, negative_prompt).to_tuple() +``` + +이제 원본 이미지와 모든 프롬프트 및 임베딩을 파이프라인으로 전달하여 이미지를 생성합니다: + + + + +```py +from diffusers.utils import make_image_grid + +image = pipeline(prompt, negative_prompt=negative_prompt, image=original_image, image_embeds=image_embeds, negative_image_embeds=negative_image_embeds, height=768, width=768, strength=0.3).images[0] +make_image_grid([original_image.resize((512, 512)), image.resize((512, 512))], rows=1, cols=2) +``` + +
+ +
+ +
+ + +```py +from diffusers.utils import make_image_grid + +image = pipeline(image=original_image, image_embeds=image_embeds, negative_image_embeds=negative_image_embeds, height=768, width=768, strength=0.3).images[0] +make_image_grid([original_image.resize((512, 512)), image.resize((512, 512))], rows=1, cols=2) +``` + +
+ +
+ +
+ 
 
```py
image = pipeline(prompt, negative_prompt=negative_prompt, image=original_image, strength=0.75, num_inference_steps=25).images[0]
image
```
 
+ +또한 🤗 Diffusers에서는 [`KandinskyImg2ImgCombinedPipeline`] 및 [`KandinskyV22Img2ImgCombinedPipeline`]이 포함된 end-to-end API를 제공하므로 prior 파이프라인과 image-to-image 파이프라인을 별도로 불러올 필요가 없습니다. 결합된 파이프라인은 prior 모델과 디코더를 모두 자동으로 불러옵니다. 원하는 경우 `prior_guidance_scale` 및 `prior_num_inference_steps` 매개 변수를 사용하여 이전 파이프라인에 대해 다른 값을 설정할 수 있습니다. + +내부에서 결합된 파이프라인을 자동으로 호출하려면 [`AutoPipelineForImage2Image`]를 사용합니다: + + + + +```py +from diffusers import AutoPipelineForImage2Image +from diffusers.utils import make_image_grid, load_image +import torch + +pipeline = AutoPipelineForImage2Image.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16, use_safetensors=True) +pipeline.enable_model_cpu_offload() + +prompt = "A fantasy landscape, Cinematic lighting" +negative_prompt = "low quality, bad quality" + +url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg" +original_image = load_image(url) + +original_image.thumbnail((768, 768)) + +image = pipeline(prompt=prompt, negative_prompt=negative_prompt, image=original_image, strength=0.3).images[0] +make_image_grid([original_image.resize((512, 512)), image.resize((512, 512))], rows=1, cols=2) +``` + + + + +```py +from diffusers import AutoPipelineForImage2Image +from diffusers.utils import make_image_grid, load_image +import torch + +pipeline = AutoPipelineForImage2Image.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16) +pipeline.enable_model_cpu_offload() + +prompt = "A fantasy landscape, Cinematic lighting" +negative_prompt = "low quality, bad quality" + +url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg" +original_image = load_image(url) + +original_image.thumbnail((768, 768)) + +image = pipeline(prompt=prompt, negative_prompt=negative_prompt, image=original_image, strength=0.3).images[0] +make_image_grid([original_image.resize((512, 512)), image.resize((512, 512))], rows=1, cols=2) +``` + + + + +## Inpainting + + + +⚠️ Kandinsky 모델은 이제 검은색 픽셀 대신 ⬜️ **흰색 픽셀**을 사용하여 마스크 영역을 표현합니다. 프로덕션에서 [`KandinskyInpaintPipeline`]을 사용하는 경우 흰색 픽셀을 사용하도록 마스크를 변경해야 합니다: + +```py +# PIL 입력에 대해 +import PIL.ImageOps +mask = PIL.ImageOps.invert(mask) + +# PyTorch와 NumPy 입력에 대해 +mask = 1 - mask +``` + + + +인페인팅에서는 원본 이미지, 원본 이미지에서 대체할 영역의 마스크, 인페인팅할 내용에 대한 텍스트 프롬프트가 필요합니다. 
Prior 파이프라인을 불러옵니다: + + + + +```py +from diffusers import KandinskyInpaintPipeline, KandinskyPriorPipeline +from diffusers.utils import load_image, make_image_grid +import torch +import numpy as np +from PIL import Image + +prior_pipeline = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16, use_safetensors=True).to("cuda") +pipeline = KandinskyInpaintPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-inpaint", torch_dtype=torch.float16, use_safetensors=True).to("cuda") +``` + + + + +```py +from diffusers import KandinskyV22InpaintPipeline, KandinskyV22PriorPipeline +from diffusers.utils import load_image, make_image_grid +import torch +import numpy as np +from PIL import Image + +prior_pipeline = KandinskyV22PriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16, use_safetensors=True).to("cuda") +pipeline = KandinskyV22InpaintPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder-inpaint", torch_dtype=torch.float16, use_safetensors=True).to("cuda") +``` + + + + +초기 이미지를 불러오고 마스크를 생성합니다: + +```py +init_image = load_image("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/cat.png") +mask = np.zeros((768, 768), dtype=np.float32) +# mask area above cat's head +mask[:250, 250:-250] = 1 +``` + +Prior 파이프라인으로 임베딩을 생성합니다: + +```py +prompt = "a hat" +prior_output = prior_pipeline(prompt) +``` + +이제 이미지 생성을 위해 초기 이미지, 마스크, 프롬프트와 임베딩을 파이프라인에 전달합니다: + + + + +```py +output_image = pipeline(prompt, image=init_image, mask_image=mask, **prior_output, height=768, width=768, num_inference_steps=150).images[0] +mask = Image.fromarray((mask*255).astype('uint8'), 'L') +make_image_grid([init_image, mask, output_image], rows=1, cols=3) +``` + +
+ +
+ +
+ + +```py +output_image = pipeline(image=init_image, mask_image=mask, **prior_output, height=768, width=768, num_inference_steps=150).images[0] +mask = Image.fromarray((mask*255).astype('uint8'), 'L') +make_image_grid([init_image, mask, output_image], rows=1, cols=3) +``` + +
+ +
+ +
+
+ +[`KandinskyInpaintCombinedPipeline`] 및 [`KandinskyV22InpaintCombinedPipeline`]을 사용하여 내부에서 prior 및 디코더 파이프라인을 함께 호출할 수 있습니다. 이를 위해 [`AutoPipelineForInpainting`]을 사용합니다: + + + + +```py +import torch +import numpy as np +from PIL import Image +from diffusers import AutoPipelineForInpainting +from diffusers.utils import load_image, make_image_grid + +pipe = AutoPipelineForInpainting.from_pretrained("kandinsky-community/kandinsky-2-1-inpaint", torch_dtype=torch.float16) +pipe.enable_model_cpu_offload() + +init_image = load_image("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/cat.png") +mask = np.zeros((768, 768), dtype=np.float32) +# 고양이 머리 위 마스크 지역 +mask[:250, 250:-250] = 1 +prompt = "a hat" + +output_image = pipe(prompt=prompt, image=init_image, mask_image=mask).images[0] +mask = Image.fromarray((mask*255).astype('uint8'), 'L') +make_image_grid([init_image, mask, output_image], rows=1, cols=3) +``` + + + + +```py +import torch +import numpy as np +from PIL import Image +from diffusers import AutoPipelineForInpainting +from diffusers.utils import load_image, make_image_grid + +pipe = AutoPipelineForInpainting.from_pretrained("kandinsky-community/kandinsky-2-2-decoder-inpaint", torch_dtype=torch.float16) +pipe.enable_model_cpu_offload() + +init_image = load_image("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/cat.png") +mask = np.zeros((768, 768), dtype=np.float32) +# 고양이 머리 위 마스크 영역 +mask[:250, 250:-250] = 1 +prompt = "a hat" + +output_image = pipe(prompt=prompt, image=original_image, mask_image=mask).images[0] +mask = Image.fromarray((mask*255).astype('uint8'), 'L') +make_image_grid([init_image, mask, output_image], rows=1, cols=3) +``` + + + + +## Interpolation (보간) + +Interpolation(보간)을 사용하면 이미지와 텍스트 임베딩 사이의 latent space를 탐색할 수 있어 prior 모델의 중간 결과물을 볼 수 있는 멋진 방법입니다. Prior 파이프라인과 보간하려는 두 개의 이미지를 불러옵니다: + + + + +```py +from diffusers import KandinskyPriorPipeline, KandinskyPipeline +from diffusers.utils import load_image, make_image_grid +import torch + +prior_pipeline = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16, use_safetensors=True).to("cuda") +img_1 = load_image("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/cat.png") +img_2 = load_image("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/starry_night.jpeg") +make_image_grid([img_1.resize((512,512)), img_2.resize((512,512))], rows=1, cols=2) +``` + + + + +```py +from diffusers import KandinskyV22PriorPipeline, KandinskyV22Pipeline +from diffusers.utils import load_image, make_image_grid +import torch + +prior_pipeline = KandinskyV22PriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16, use_safetensors=True).to("cuda") +img_1 = load_image("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/cat.png") +img_2 = load_image("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/starry_night.jpeg") +make_image_grid([img_1.resize((512,512)), img_2.resize((512,512))], rows=1, cols=2) +``` + + + + +
+
+a cat
+
+Van Gogh's Starry Night painting
+
+ +보간할 텍스트 또는 이미지를 지정하고 각 텍스트 또는 이미지에 대한 가중치를 설정합니다. 가중치를 실험하여 보간에 어떤 영향을 미치는지 확인하세요! + +```py +images_texts = ["a cat", img_1, img_2] +weights = [0.3, 0.3, 0.4] +``` + +`interpolate` 함수를 호출하여 임베딩을 생성한 다음, 파이프라인으로 전달하여 이미지를 생성합니다: + + + + +```py +# 프롬프트는 빈칸으로 남겨도 됩니다 +prompt = "" +prior_out = prior_pipeline.interpolate(images_texts, weights) + +pipeline = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16, use_safetensors=True).to("cuda") + +image = pipeline(prompt, **prior_out, height=768, width=768).images[0] +image +``` + +
+ +
+ +
+ + +```py +# 프롬프트는 빈칸으로 남겨도 됩니다 +prompt = "" +prior_out = prior_pipeline.interpolate(images_texts, weights) + +pipeline = KandinskyV22Pipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16, use_safetensors=True).to("cuda") + +image = pipeline(prompt, **prior_out, height=768, width=768).images[0] +image +``` + +
+ +
+ +
+
+ +## ControlNet + + + +⚠️ ControlNet은 Kandinsky 2.2에서만 지원됩니다! + + + +ControlNet을 사용하면 depth map이나 edge detection와 같은 추가 입력을 통해 사전학습된 large diffusion 모델을 conditioning할 수 있습니다. 예를 들어, 모델이 depth map의 구조를 이해하고 보존할 수 있도록 깊이 맵으로 Kandinsky 2.2를 conditioning할 수 있습니다. + +이미지를 불러오고 depth map을 추출해 보겠습니다: + +```py +from diffusers.utils import load_image + +img = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinskyv22/cat.png" +).resize((768, 768)) +img +``` + +
+ +
+ +그런 다음 🤗 Transformers의 `depth-estimation` [`~transformers.Pipeline`]을 사용하여 이미지를 처리해 depth map을 구할 수 있습니다: + +```py +import torch +import numpy as np + +from transformers import pipeline + +def make_hint(image, depth_estimator): + image = depth_estimator(image)["depth"] + image = np.array(image) + image = image[:, :, None] + image = np.concatenate([image, image, image], axis=2) + detected_map = torch.from_numpy(image).float() / 255.0 + hint = detected_map.permute(2, 0, 1) + return hint + +depth_estimator = pipeline("depth-estimation") +hint = make_hint(img, depth_estimator).unsqueeze(0).half().to("cuda") +``` + +### Text-to-image [[controlnet-text-to-image]] + +Prior 파이프라인과 [`KandinskyV22ControlnetPipeline`]를 불러옵니다: + +```py +from diffusers import KandinskyV22PriorPipeline, KandinskyV22ControlnetPipeline + +prior_pipeline = KandinskyV22PriorPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16, use_safetensors=True +).to("cuda") + +pipeline = KandinskyV22ControlnetPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16 +).to("cuda") +``` + +프롬프트와 negative 프롬프트로 이미지 임베딩을 생성합니다: + +```py +prompt = "A robot, 4k photo" +negative_prior_prompt = "lowres, text, error, cropped, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, out of frame, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck, username, watermark, signature" + +generator = torch.Generator(device="cuda").manual_seed(43) + +image_emb, zero_image_emb = prior_pipeline( + prompt=prompt, negative_prompt=negative_prior_prompt, generator=generator +).to_tuple() +``` + +마지막으로 이미지 임베딩과 depth 이미지를 [`KandinskyV22ControlnetPipeline`]에 전달하여 이미지를 생성합니다: + +```py +image = pipeline(image_embeds=image_emb, negative_image_embeds=zero_image_emb, hint=hint, num_inference_steps=50, generator=generator, height=768, width=768).images[0] +image +``` + +
+ +
+ +### Image-to-image [[controlnet-image-to-image]] + +ControlNet을 사용한 image-to-image의 경우, 다음을 사용할 필요가 있습니다: + +- [`KandinskyV22PriorEmb2EmbPipeline`]로 텍스트 프롬프트와 이미지에서 이미지 임베딩을 생성합니다. +- [`KandinskyV22ControlnetImg2ImgPipeline`]로 초기 이미지와 이미지 임베딩에서 이미지를 생성합니다. + +🤗 Transformers에서 `depth-estimation` [`~transformers.Pipeline`]을 사용하여 고양이의 초기 이미지의 depth map을 처리해 추출합니다: + +```py +import torch +import numpy as np + +from diffusers import KandinskyV22PriorEmb2EmbPipeline, KandinskyV22ControlnetImg2ImgPipeline +from diffusers.utils import load_image +from transformers import pipeline + +img = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinskyv22/cat.png" +).resize((768, 768)) + +def make_hint(image, depth_estimator): + image = depth_estimator(image)["depth"] + image = np.array(image) + image = image[:, :, None] + image = np.concatenate([image, image, image], axis=2) + detected_map = torch.from_numpy(image).float() / 255.0 + hint = detected_map.permute(2, 0, 1) + return hint + +depth_estimator = pipeline("depth-estimation") +hint = make_hint(img, depth_estimator).unsqueeze(0).half().to("cuda") +``` + +Prior 파이프라인과 [`KandinskyV22ControlnetImg2ImgPipeline`]을 불러옵니다: + +```py +prior_pipeline = KandinskyV22PriorEmb2EmbPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16, use_safetensors=True +).to("cuda") + +pipeline = KandinskyV22ControlnetImg2ImgPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16 +).to("cuda") +``` + +텍스트 프롬프트와 초기 이미지를 이전 파이프라인에 전달하여 이미지 임베딩을 생성합니다: + +```py +prompt = "A robot, 4k photo" +negative_prior_prompt = "lowres, text, error, cropped, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, out of frame, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck, username, watermark, signature" + +generator = torch.Generator(device="cuda").manual_seed(43) + +img_emb = prior_pipeline(prompt=prompt, image=img, strength=0.85, generator=generator) +negative_emb = prior_pipeline(prompt=negative_prior_prompt, image=img, strength=1, generator=generator) +``` + +이제 [`KandinskyV22ControlnetImg2ImgPipeline`]을 실행하여 초기 이미지와 이미지 임베딩으로부터 이미지를 생성할 수 있습니다: + +```py +image = pipeline(image=img, strength=0.5, image_embeds=img_emb.image_embeds, negative_image_embeds=negative_emb.image_embeds, hint=hint, num_inference_steps=50, generator=generator, height=768, width=768).images[0] +make_image_grid([img.resize((512, 512)), image.resize((512, 512))], rows=1, cols=2) +``` + +
+ +
+ +## 최적화 + +Kandinsky는 mapping을 생성하기 위한 prior 파이프라인과 latents를 이미지로 디코딩하기 위한 두 번째 파이프라인이 필요하다는 점에서 독특합니다. 대부분의 계산이 두 번째 파이프라인에서 이루어지므로 최적화의 노력은 두 번째 파이프라인에 집중되어야 합니다. 다음은 추론 중 Kandinsky키를 개선하기 위한 몇 가지 팁입니다. + +1. PyTorch < 2.0을 사용할 경우 [xFormers](../optimization/xformers)을 활성화합니다. + +```diff + from diffusers import DiffusionPipeline + import torch + + pipe = DiffusionPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16) ++ pipe.enable_xformers_memory_efficient_attention() +``` + +2. PyTorch >= 2.0을 사용할 경우 `torch.compile`을 활성화하여 scaled dot-product attention (SDPA)를 자동으로 사용하도록 합니다: + +```diff + pipe.unet.to(memory_format=torch.channels_last) ++ pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) +``` + +이는 attention processor를 명시적으로 [`~models.attention_processor.AttnAddedKVProcessor2_0`]을 사용하도록 설정하는 것과 동일합니다: + +```py +from diffusers.models.attention_processor import AttnAddedKVProcessor2_0 + +pipe.unet.set_attn_processor(AttnAddedKVProcessor2_0()) +``` + +3. 메모리 부족 오류를 방지하기 위해 [`~KandinskyPriorPipeline.enable_model_cpu_offload`]를 사용하여 모델을 CPU로 오프로드합니다: + +```diff + from diffusers import DiffusionPipeline + import torch + + pipe = DiffusionPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16) ++ pipe.enable_model_cpu_offload() +``` + +4. 기본적으로 text-to-image 파이프라인은 [`DDIMScheduler`]를 사용하지만, [`DDPMScheduler`]와 같은 다른 스케줄러로 대체하여 추론 속도와 이미지 품질 간의 균형에 어떤 영향을 미치는지 확인할 수 있습니다: + +```py +from diffusers import DDPMScheduler +from diffusers import DiffusionPipeline + +scheduler = DDPMScheduler.from_pretrained("kandinsky-community/kandinsky-2-1", subfolder="ddpm_scheduler") +pipe = DiffusionPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", scheduler=scheduler, torch_dtype=torch.float16, use_safetensors=True).to("cuda") +``` diff --git a/docs/source/ko/using-diffusers/loading_adapters.md b/docs/source/ko/using-diffusers/loading_adapters.md new file mode 100644 index 000000000000..e8b8fcb506a0 --- /dev/null +++ b/docs/source/ko/using-diffusers/loading_adapters.md @@ -0,0 +1,359 @@ + + +# 어댑터 불러오기 + +[[open-in-colab]] + +특정 물체의 이미지 또는 특정 스타일의 이미지를 생성하도록 diffusion 모델을 개인화하기 위한 몇 가지 [학습](../training/overview) 기법이 있습니다. 이러한 학습 방법은 각각 다른 유형의 어댑터를 생성합니다. 일부 어댑터는 완전히 새로운 모델을 생성하는 반면, 다른 어댑터는 임베딩 또는 가중치의 작은 부분만 수정합니다. 이는 각 어댑터의 로딩 프로세스도 다르다는 것을 의미합니다. + +이 가이드에서는 DreamBooth, textual inversion 및 LoRA 가중치를 불러오는 방법을 설명합니다. + + + +사용할 체크포인트와 임베딩은 [Stable Diffusion Conceptualizer](https://huggingface.co/spaces/sd-concepts-library/stable-diffusion-conceptualizer), [LoRA the Explorer](https://huggingface.co/spaces/multimodalart/LoraTheExplorer), [Diffusers Models Gallery](https://huggingface.co/spaces/huggingface-projects/diffusers-gallery)에서 찾아보시기 바랍니다. + + + +## DreamBooth + +[DreamBooth](https://dreambooth.github.io/)는 물체의 여러 이미지에 대한 *diffusion 모델 전체*를 미세 조정하여 새로운 스타일과 설정으로 해당 물체의 이미지를 생성합니다. 이 방법은 모델이 물체 이미지와 연관시키는 방법을 학습하는 프롬프트에 특수 단어를 사용하는 방식으로 작동합니다. 모든 학습 방법 중에서 드림부스는 전체 체크포인트 모델이기 때문에 파일 크기가 가장 큽니다(보통 몇 GB). + +Hergé가 그린 단 10개의 이미지로 학습된 [herge_style](https://huggingface.co/sd-dreambooth-library/herge-style) 체크포인트를 불러와 해당 스타일의 이미지를 생성해 보겠습니다. 
이 모델이 작동하려면 체크포인트를 트리거하는 프롬프트에 특수 단어 `herge_style`을 포함시켜야 합니다: + +```py +from diffusers import AutoPipelineForText2Image +import torch + +pipeline = AutoPipelineForText2Image.from_pretrained("sd-dreambooth-library/herge-style", torch_dtype=torch.float16).to("cuda") +prompt = "A cute herge_style brown bear eating a slice of pizza, stunning color scheme, masterpiece, illustration" +image = pipeline(prompt).images[0] +image +``` + +
+ +
+ 
 
## Textual inversion
 
[Textual inversion](https://textual-inversion.github.io/)은 DreamBooth와 매우 유사하며 몇 개의 이미지만으로 특정 개념(스타일, 개체)을 생성하는 diffusion 모델을 개인화할 수도 있습니다. 이 방법은 프롬프트에 특정 단어를 입력하면 해당 이미지를 나타내는 새로운 임베딩을 학습하고 찾아내는 방식으로 작동합니다. 결과적으로 diffusion 모델 가중치는 동일하게 유지되고 훈련 프로세스는 비교적 작은(수 KB) 파일을 생성합니다.
 
Textual inversion은 임베딩을 생성하기 때문에 DreamBooth처럼 단독으로 사용할 수 없으며 또 다른 모델이 필요합니다.
 
```py
from diffusers import AutoPipelineForText2Image
import torch

pipeline = AutoPipelineForText2Image.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda")
```
 
이제 [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] 메서드를 사용하여 textual inversion 임베딩을 불러와 이미지를 생성할 수 있습니다. [sd-concepts-library/gta5-artwork](https://huggingface.co/sd-concepts-library/gta5-artwork) 임베딩을 불러와 보겠습니다. 이를 트리거하려면 프롬프트에 특수 단어 `<gta5-artwork>`를 포함시켜야 합니다:
 
```py
pipeline.load_textual_inversion("sd-concepts-library/gta5-artwork")
prompt = "A cute brown bear eating a slice of pizza, stunning color scheme, masterpiece, illustration, <gta5-artwork> style"
image = pipeline(prompt).images[0]
image
```
 
+ +
+ +Textual inversion은 또한 바람직하지 않은 사물에 대해 *네거티브 임베딩*을 생성하여 모델이 흐릿한 이미지나 손의 추가 손가락과 같은 바람직하지 않은 사물이 포함된 이미지를 생성하지 못하도록 학습할 수도 있습니다. 이는 프롬프트를 빠르게 개선하는 것이 쉬운 방법이 될 수 있습니다. 이는 이전과 같이 임베딩을 [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`]으로 불러오지만 이번에는 두 개의 매개변수가 더 필요합니다: + +- `weight_name`: 파일이 특정 이름의 🤗 Diffusers 형식으로 저장된 경우이거나 파일이 A1111 형식으로 저장된 경우, 불러올 가중치 파일을 지정합니다. +- `token`: 임베딩을 트리거하기 위해 프롬프트에서 사용할 특수 단어를 지정합니다. + +[sayakpaul/EasyNegative-test](https://huggingface.co/sayakpaul/EasyNegative-test) 임베딩을 불러와 보겠습니다: + +```py +pipeline.load_textual_inversion( + "sayakpaul/EasyNegative-test", weight_name="EasyNegative.safetensors", token="EasyNegative" +) +``` + +이제 `token`을 사용해 네거티브 임베딩이 있는 이미지를 생성할 수 있습니다: + +```py +prompt = "A cute brown bear eating a slice of pizza, stunning color scheme, masterpiece, illustration, EasyNegative" +negative_prompt = "EasyNegative" + +image = pipeline(prompt, negative_prompt=negative_prompt, num_inference_steps=50).images[0] +image +``` + +
+ +
+ +## LoRA + +[Low-Rank Adaptation (LoRA)](https://huggingface.co/papers/2106.09685)은 속도가 빠르고 파일 크기가 (수백 MB로) 작기 때문에 널리 사용되는 학습 기법입니다. 이 가이드의 다른 방법과 마찬가지로, LoRA는 몇 장의 이미지만으로 새로운 스타일을 학습하도록 모델을 학습시킬 수 있습니다. 이는 diffusion 모델에 새로운 가중치를 삽입한 다음 전체 모델 대신 새로운 가중치만 학습시키는 방식으로 작동합니다. 따라서 LoRA를 더 빠르게 학습시키고 더 쉽게 저장할 수 있습니다. + + + +LoRA는 다른 학습 방법과 함께 사용할 수 있는 매우 일반적인 학습 기법입니다. 예를 들어, DreamBooth와 LoRA로 모델을 학습하는 것이 일반적입니다. 또한 새롭고 고유한 이미지를 생성하기 위해 여러 개의 LoRA를 불러오고 병합하는 것이 점점 더 일반화되고 있습니다. 병합은 이 불러오기 가이드의 범위를 벗어나므로 자세한 내용은 심층적인 [LoRA 병합](merge_loras) 가이드에서 확인할 수 있습니다. + + + +LoRA는 다른 모델과 함께 사용해야 합니다: + +```py +from diffusers import AutoPipelineForText2Image +import torch + +pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda") +``` + +그리고 [`~loaders.LoraLoaderMixin.load_lora_weights`] 메서드를 사용하여 [ostris/super-cereal-sdxl-lora](https://huggingface.co/ostris/super-cereal-sdxl-lora) 가중치를 불러오고 리포지토리에서 가중치 파일명을 지정합니다: + +```py +pipeline.load_lora_weights("ostris/super-cereal-sdxl-lora", weight_name="cereal_box_sdxl_v1.safetensors") +prompt = "bears, pizza bites" +image = pipeline(prompt).images[0] +image +``` + +
+ +
+ 
 
[`~loaders.LoraLoaderMixin.load_lora_weights`] 메서드는 LoRA 가중치를 UNet과 텍스트 인코더에 모두 불러옵니다. 다음과 같은 경우에는 이 메서드로 LoRA를 불러오는 것이 선호됩니다:
 
- LoRA 가중치에 UNet 및 텍스트 인코더에 대한 별도의 식별자가 없는 경우
- LoRA 가중치에 UNet과 텍스트 인코더에 대한 별도의 식별자가 있는 경우
 
하지만 LoRA 가중치를 UNet에만 불러와야 하는 경우에는 [`~loaders.UNet2DConditionLoadersMixin.load_attn_procs`] 메서드를 사용할 수 있습니다. [jbilcke-hf/sdxl-cinematic-1](https://huggingface.co/jbilcke-hf/sdxl-cinematic-1) LoRA를 불러와 보겠습니다:
 
```py
from diffusers import AutoPipelineForText2Image
import torch

pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda")
pipeline.unet.load_attn_procs("jbilcke-hf/sdxl-cinematic-1", weight_name="pytorch_lora_weights.safetensors")

# 프롬프트에서 cnmt를 사용하여 LoRA를 트리거합니다.
prompt = "A cute cnmt eating a slice of pizza, stunning color scheme, masterpiece, illustration"
image = pipeline(prompt).images[0]
image
```
 
+ +
+ +LoRA 가중치를 언로드하려면 [`~loaders.LoraLoaderMixin.unload_lora_weights`] 메서드를 사용하여 LoRA 가중치를 삭제하고 모델을 원래 가중치로 복원합니다: + +```py +pipeline.unload_lora_weights() +``` + +### LoRA 가중치 스케일 조정하기 + +[`~loaders.LoraLoaderMixin.load_lora_weights`] 및 [`~loaders.UNet2DConditionLoadersMixin.load_attn_procs`] 모두 `cross_attention_kwargs={"scale": 0.5}` 파라미터를 전달하여 얼마나 LoRA 가중치를 사용할지 조정할 수 있습니다. 값이 `0`이면 기본 모델 가중치만 사용하는 것과 같고, 값이 `1`이면 완전히 미세 조정된 LoRA를 사용하는 것과 같습니다. + +레이어당 사용되는 LoRA 가중치의 양을 보다 세밀하게 제어하려면 [`~loaders.LoraLoaderMixin.set_adapters`]를 사용하여 각 레이어의 가중치를 얼마만큼 조정할지 지정하는 딕셔너리를 전달할 수 있습니다. +```python +pipe = ... # 파이프라인 생성 +pipe.load_lora_weights(..., adapter_name="my_adapter") +scales = { + "text_encoder": 0.5, + "text_encoder_2": 0.5, # 파이프에 두 번째 텍스트 인코더가 있는 경우에만 사용 가능 + "unet": { + "down": 0.9, # down 부분의 모든 트랜스포머는 스케일 0.9를 사용 + # "mid" # 이 예제에서는 "mid"가 지정되지 않았으므로 중간 부분의 모든 트랜스포머는 기본 스케일 1.0을 사용 + "up": { + "block_0": 0.6, # # up의 0번째 블록에 있는 3개의 트랜스포머는 모두 스케일 0.6을 사용 + "block_1": [0.4, 0.8, 1.0], # up의 첫 번째 블록에 있는 3개의 트랜스포머는 각각 스케일 0.4, 0.8, 1.0을 사용 + } + } +} +pipe.set_adapters("my_adapter", scales) +``` + +이는 여러 어댑터에서도 작동합니다. 방법은 [이 가이드](https://huggingface.co/docs/diffusers/tutorials/using_peft_for_inference#customize-adapters-strength)를 참조하세요. + + + +현재 [`~loaders.LoraLoaderMixin.set_adapters`]는 어텐션 가중치의 스케일링만 지원합니다. LoRA에 다른 부분(예: resnets or down-/upsamplers)이 있는 경우 1.0의 스케일을 유지합니다. + + + +### Kohya와 TheLastBen + +커뮤니티에서 인기 있는 다른 LoRA trainer로는 [Kohya](https://github.com/kohya-ss/sd-scripts/)와 [TheLastBen](https://github.com/TheLastBen/fast-stable-diffusion)의 trainer가 있습니다. 이 trainer들은 🤗 Diffusers가 훈련한 것과는 다른 LoRA 체크포인트를 생성하지만, 같은 방식으로 불러올 수 있습니다. + + + + +Kohya LoRA를 불러오기 위해, 예시로 [Civitai](https://civitai.com/)에서 [Blueprintify SD XL 1.0](https://civitai.com/models/150986/blueprintify-sd-xl-10) 체크포인트를 다운로드합니다: + +```sh +!wget https://civitai.com/api/download/models/168776 -O blueprintify-sd-xl-10.safetensors +``` + +LoRA 체크포인트를 [`~loaders.LoraLoaderMixin.load_lora_weights`] 메서드로 불러오고 `weight_name` 파라미터에 파일명을 지정합니다: + +```py +from diffusers import AutoPipelineForText2Image +import torch + +pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda") +pipeline.load_lora_weights("path/to/weights", weight_name="blueprintify-sd-xl-10.safetensors") +``` + +이미지를 생성합니다: + +```py +# LoRA를 트리거하기 위해 bl3uprint를 프롬프트에 사용 +prompt = "bl3uprint, a highly detailed blueprint of the eiffel tower, explaining how to build all parts, many txt, blueprint grid backdrop" +image = pipeline(prompt).images[0] +image +``` + + + +Kohya LoRA를 🤗 Diffusers와 함께 사용할 때 몇 가지 제한 사항이 있습니다: + +- [여기](https://github.com/huggingface/diffusers/pull/4287/#issuecomment-1655110736)에 설명된 여러 가지 이유로 인해 이미지가 ComfyUI와 같은 UI에서 생성된 이미지와 다르게 보일 수 있습니다. +- [LyCORIS 체크포인트](https://github.com/KohakuBlueleaf/LyCORIS)가 완전히 지원되지 않습니다. [`~loaders.LoraLoaderMixin.load_lora_weights`] 메서드는 LoRA 및 LoCon 모듈로 LyCORIS 체크포인트를 불러올 수 있지만, Hada 및 LoKR은 지원되지 않습니다. + + + + + + +TheLastBen에서 체크포인트를 불러오는 방법은 매우 유사합니다. 
예를 들어, [TheLastBen/William_Eggleston_Style_SDXL](https://huggingface.co/TheLastBen/William_Eggleston_Style_SDXL) 체크포인트를 불러오려면: + +```py +from diffusers import AutoPipelineForText2Image +import torch + +pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda") +pipeline.load_lora_weights("TheLastBen/William_Eggleston_Style_SDXL", weight_name="wegg.safetensors") + +# LoRA를 트리거하기 위해 william eggleston를 프롬프트에 사용 +prompt = "a house by william eggleston, sunrays, beautiful, sunlight, sunrays, beautiful" +image = pipeline(prompt=prompt).images[0] +image +``` + + + + +## IP-Adapter + +[IP-Adapter](https://ip-adapter.github.io/)는 모든 diffusion 모델에 이미지 프롬프트를 사용할 수 있는 경량 어댑터입니다. 이 어댑터는 이미지와 텍스트 feature의 cross-attention 레이어를 분리하여 작동합니다. 다른 모든 모델 컴포넌트튼 freeze되고 UNet의 embedded 이미지 features만 학습됩니다. 따라서 IP-Adapter 파일은 일반적으로 최대 100MB에 불과합니다. + +다양한 작업과 구체적인 사용 사례에 IP-Adapter를 사용하는 방법에 대한 자세한 내용은 [IP-Adapter](../using-diffusers/ip_adapter) 가이드에서 확인할 수 있습니다. + +> [!TIP] +> Diffusers는 현재 가장 많이 사용되는 일부 파이프라인에 대해서만 IP-Adapter를 지원합니다. 멋진 사용 사례가 있는 지원되지 않는 파이프라인에 IP-Adapter를 통합하고 싶다면 언제든지 기능 요청을 여세요! +> 공식 IP-Adapter 체크포인트는 [h94/IP-Adapter](https://huggingface.co/h94/IP-Adapter)에서 확인할 수 있습니다. + +시작하려면 Stable Diffusion 체크포인트를 불러오세요. + +```py +from diffusers import AutoPipelineForText2Image +import torch +from diffusers.utils import load_image + +pipeline = AutoPipelineForText2Image.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda") +``` + +그런 다음 IP-Adapter 가중치를 불러와 [`~loaders.IPAdapterMixin.load_ip_adapter`] 메서드를 사용하여 파이프라인에 추가합니다. + +```py +pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin") +``` + +불러온 뒤, 이미지 및 텍스트 프롬프트가 있는 파이프라인을 사용하여 이미지 생성 프로세스를 가이드할 수 있습니다. + +```py +image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_neg_embed.png") +generator = torch.Generator(device="cpu").manual_seed(33) +images = pipeline( +    prompt='best quality, high quality, wearing sunglasses', +    ip_adapter_image=image, +    negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality", +    num_inference_steps=50, +    generator=generator, +).images[0] +images +``` + +
+    +
+ +### IP-Adapter Plus + +IP-Adapter는 이미지 인코더를 사용하여 이미지 feature를 생성합니다. IP-Adapter 리포지토리에 `image_encoder` 하위 폴더가 있는 경우, 이미지 인코더가 자동으로 불러와 파이프라인에 등록됩니다. 그렇지 않은 경우, [`~transformers.CLIPVisionModelWithProjection`] 모델을 사용하여 이미지 인코더를 명시적으로 불러와 파이프라인에 전달해야 합니다. + +이는 ViT-H 이미지 인코더를 사용하는 *IP-Adapter Plus* 체크포인트에 해당하는 케이스입니다. + +```py +from transformers import CLIPVisionModelWithProjection + +image_encoder = CLIPVisionModelWithProjection.from_pretrained( + "h94/IP-Adapter", + subfolder="models/image_encoder", + torch_dtype=torch.float16 +) + +pipeline = AutoPipelineForText2Image.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", + image_encoder=image_encoder, + torch_dtype=torch.float16 +).to("cuda") + +pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter-plus_sdxl_vit-h.safetensors") +``` + +### IP-Adapter Face ID 모델 + +IP-Adapter FaceID 모델은 CLIP 이미지 임베딩 대신 `insightface`에서 생성한 이미지 임베딩을 사용하는 실험적인 IP Adapter입니다. 이러한 모델 중 일부는 LoRA를 사용하여 ID 일관성을 개선하기도 합니다. +이러한 모델을 사용하려면 `insightface`와 해당 요구 사항을 모두 설치해야 합니다. + + +InsightFace 사전학습된 모델은 비상업적 연구 목적으로만 사용할 수 있으므로, IP-Adapter-FaceID 모델은 연구 목적으로만 릴리즈되었으며 상업적 용도로는 사용할 수 없습니다. + + +```py +pipeline = AutoPipelineForText2Image.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", + torch_dtype=torch.float16 +).to("cuda") + +pipeline.load_ip_adapter("h94/IP-Adapter-FaceID", subfolder=None, weight_name="ip-adapter-faceid_sdxl.bin", image_encoder_folder=None) +``` + +두 가지 IP 어댑터 FaceID Plus 모델 중 하나를 사용하려는 경우, 이 모델들은 더 나은 사실감을 얻기 위해 `insightface`와 CLIP 이미지 임베딩을 모두 사용하므로, CLIP 이미지 인코더도 불러와야 합니다. + +```py +from transformers import CLIPVisionModelWithProjection + +image_encoder = CLIPVisionModelWithProjection.from_pretrained( + "laion/CLIP-ViT-H-14-laion2B-s32B-b79K", + torch_dtype=torch.float16, +) + +pipeline = AutoPipelineForText2Image.from_pretrained( + "runwayml/stable-diffusion-v1-5", + image_encoder=image_encoder, + torch_dtype=torch.float16 +).to("cuda") + +pipeline.load_ip_adapter("h94/IP-Adapter-FaceID", subfolder=None, weight_name="ip-adapter-faceid-plus_sd15.bin") +``` diff --git a/docs/source/ko/using-diffusers/loading_overview.md b/docs/source/ko/using-diffusers/loading_overview.md deleted file mode 100644 index d2499f9ee4cf..000000000000 --- a/docs/source/ko/using-diffusers/loading_overview.md +++ /dev/null @@ -1,18 +0,0 @@ - - -# Overview - -🧨 Diffusers는 생성 작업을 위한 다양한 파이프라인, 모델, 스케줄러를 제공합니다. 이러한 컴포넌트를 최대한 간단하게 로드할 수 있도록 단일 통합 메서드인 `from_pretrained()`를 제공하여 Hugging Face [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) 또는 로컬 머신에서 이러한 컴포넌트를 불러올 수 있습니다. 파이프라인이나 모델을 로드할 때마다, 최신 파일이 자동으로 다운로드되고 캐시되므로, 다음에 파일을 다시 다운로드하지 않고도 빠르게 재사용할 수 있습니다. - -이 섹션은 파이프라인 로딩, 파이프라인에서 다양한 컴포넌트를 로드하는 방법, 체크포인트 variants를 불러오는 방법, 그리고 커뮤니티 파이프라인을 불러오는 방법에 대해 알아야 할 모든 것들을 다룹니다. 또한 스케줄러를 불러오는 방법과 서로 다른 스케줄러를 사용할 때 발생하는 속도와 품질간의 트레이드 오프를 비교하는 방법 역시 다룹니다. 그리고 마지막으로 🧨 Diffusers와 함께 파이토치에서 사용할 수 있도록 KerasCV 체크포인트를 변환하고 불러오는 방법을 살펴봅니다. - diff --git a/docs/source/ko/using-diffusers/other-formats.md b/docs/source/ko/using-diffusers/other-formats.md index 3e05228e4548..530b2ea90a5f 100644 --- a/docs/source/ko/using-diffusers/other-formats.md +++ b/docs/source/ko/using-diffusers/other-formats.md @@ -127,7 +127,7 @@ image = pipeline(prompt, num_inference_steps=50).images[0] [Automatic1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui) (A1111)은 Stable Diffusion을 위해 널리 사용되는 웹 UI로, [Civitai](https://civitai.com/) 와 같은 모델 공유 플랫폼을 지원합니다. 
특히 LoRA 기법으로 학습된 모델은 학습 속도가 빠르고 완전히 파인튜닝된 모델보다 파일 크기가 훨씬 작기 때문에 인기가 높습니다. -🤗 Diffusers는 [`~loaders.LoraLoaderMixin.load_lora_weights`]:를 사용하여 A1111 LoRA 체크포인트 불러오기를 지원합니다: +🤗 Diffusers는 [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]:를 사용하여 A1111 LoRA 체크포인트 불러오기를 지원합니다: ```py from diffusers import DiffusionPipeline, UniPCMultistepScheduler diff --git a/docs/source/ko/using-diffusers/pipeline_overview.md b/docs/source/ko/using-diffusers/pipeline_overview.md deleted file mode 100644 index a2793eb2a6ba..000000000000 --- a/docs/source/ko/using-diffusers/pipeline_overview.md +++ /dev/null @@ -1,17 +0,0 @@ - - -# Overview - -파이프라인은 독립적으로 훈련된 모델과 스케줄러를 함께 모아서 추론을 위해 diffusion 시스템을 빠르고 쉽게 사용할 수 있는 방법을 제공하는 end-to-end 클래스입니다. 모델과 스케줄러의 특정 조합은 특수한 기능과 함께 [`StableDiffusionPipeline`] 또는 [`StableDiffusionControlNetPipeline`]과 같은 특정 파이프라인 유형을 정의합니다. 모든 파이프라인 유형은 기본 [`DiffusionPipeline`] 클래스에서 상속됩니다. 어느 체크포인트를 전달하면, 파이프라인 유형을 자동으로 감지하고 필요한 구성 요소들을 불러옵니다. - -이 섹션에서는 unconditional 이미지 생성, text-to-image 생성의 다양한 테크닉과 변화를 파이프라인에서 지원하는 작업들을 소개합니다. 프롬프트에 있는 특정 단어가 출력에 영향을 미치는 것을 조정하기 위해 재현성을 위한 시드 설정과 프롬프트에 가중치를 부여하는 것으로 생성 프로세스를 더 잘 제어하는 방법에 대해 배울 수 있습니다. 마지막으로 음성에서부터 이미지 생성과 같은 커스텀 작업을 위한 커뮤니티 파이프라인을 만드는 방법을 알 수 있습니다. diff --git a/docs/source/ko/using-diffusers/push_to_hub.md b/docs/source/ko/using-diffusers/push_to_hub.md new file mode 100644 index 000000000000..c97a1150c8ab --- /dev/null +++ b/docs/source/ko/using-diffusers/push_to_hub.md @@ -0,0 +1,177 @@ + + +# 파일들을 Hub로 푸시하기 + +[[open-in-colab]] + +🤗 Diffusers는 모델, 스케줄러 또는 파이프라인을 Hub에 업로드할 수 있는 [`~diffusers.utils.PushToHubMixin`]을 제공합니다. 이는 Hub에 당신의 파일을 저장하는 쉬운 방법이며, 다른 사람들과 작업을 공유할 수도 있습니다. 실제적으로 [`~diffusers.utils.PushToHubMixin`]가 동작하는 방식은 다음과 같습니다: + +1. Hub에 리포지토리를 생성합니다. +2. 나중에 다시 불러올 수 있도록 모델, 스케줄러 또는 파이프라인 파일을 저장합니다. +3. 이러한 파일이 포함된 폴더를 Hub에 업로드합니다. + +이 가이드는 [`~diffusers.utils.PushToHubMixin`]을 사용하여 Hub에 파일을 업로드하는 방법을 보여줍니다. + +먼저 액세스 [토큰](https://huggingface.co/settings/tokens)으로 Hub 계정에 로그인해야 합니다: + +```py +from huggingface_hub import notebook_login + +notebook_login() +``` + +## 모델 + +모델을 허브에 푸시하려면 [`~diffusers.utils.PushToHubMixin.push_to_hub`]를 호출하고 Hub에 저장할 모델의 리포지토리 id를 지정합니다: + +```py +from diffusers import ControlNetModel + +controlnet = ControlNetModel( + block_out_channels=(32, 64), + layers_per_block=2, + in_channels=4, + down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), + cross_attention_dim=32, + conditioning_embedding_out_channels=(16, 32), +) +controlnet.push_to_hub("my-controlnet-model") +``` + +모델의 경우 Hub에 푸시할 가중치의 [*변형*](loading#checkpoint-variants)을 지정할 수도 있습니다. 예를 들어, `fp16` 가중치를 푸시하려면 다음과 같이 하세요: + +```py +controlnet.push_to_hub("my-controlnet-model", variant="fp16") +``` + +[`~diffusers.utils.PushToHubMixin.push_to_hub`] 함수는 모델의 `config.json` 파일을 저장하고 가중치는 `safetensors` 형식으로 자동으로 저장됩니다. + +이제 Hub의 리포지토리에서 모델을 다시 불러올 수 있습니다: + +```py +model = ControlNetModel.from_pretrained("your-namespace/my-controlnet-model") +``` + +## 스케줄러 + +스케줄러를 허브에 푸시하려면 [`~diffusers.utils.PushToHubMixin.push_to_hub`]를 호출하고 Hub에 저장할 스케줄러의 리포지토리 id를 지정합니다: + +```py +from diffusers import DDIMScheduler + +scheduler = DDIMScheduler( + beta_start=0.00085, + beta_end=0.012, + beta_schedule="scaled_linear", + clip_sample=False, + set_alpha_to_one=False, +) +scheduler.push_to_hub("my-controlnet-scheduler") +``` + +[`~diffusers.utils.PushToHubMixin.push_to_hub`] 함수는 스케줄러의 `scheduler_config.json` 파일을 지정된 리포지토리에 저장합니다. 
+ +이제 허브의 리포지토리에서 스케줄러를 다시 불러올 수 있습니다: + +```py +scheduler = DDIMScheduler.from_pretrained("your-namepsace/my-controlnet-scheduler") +``` + +## 파이프라인 + +모든 컴포넌트가 포함된 전체 파이프라인을 Hub로 푸시할 수도 있습니다. 예를 들어, 원하는 파라미터로 [`StableDiffusionPipeline`]의 컴포넌트들을 초기화합니다: + +```py +from diffusers import ( + UNet2DConditionModel, + AutoencoderKL, + DDIMScheduler, + StableDiffusionPipeline, +) +from transformers import CLIPTextModel, CLIPTextConfig, CLIPTokenizer + +unet = UNet2DConditionModel( + block_out_channels=(32, 64), + layers_per_block=2, + sample_size=32, + in_channels=4, + out_channels=4, + down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), + up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), + cross_attention_dim=32, +) + +scheduler = DDIMScheduler( + beta_start=0.00085, + beta_end=0.012, + beta_schedule="scaled_linear", + clip_sample=False, + set_alpha_to_one=False, +) + +vae = AutoencoderKL( + block_out_channels=[32, 64], + in_channels=3, + out_channels=3, + down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], + up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], + latent_channels=4, +) + +text_encoder_config = CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=32, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, +) +text_encoder = CLIPTextModel(text_encoder_config) +tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") +``` + +모든 컴포넌트들을 [`StableDiffusionPipeline`]에 전달하고 [`~diffusers.utils.PushToHubMixin.push_to_hub`]를 호출하여 파이프라인을 Hub로 푸시합니다: + +```py +components = { + "unet": unet, + "scheduler": scheduler, + "vae": vae, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "safety_checker": None, + "feature_extractor": None, +} + +pipeline = StableDiffusionPipeline(**components) +pipeline.push_to_hub("my-pipeline") +``` + +[`~diffusers.utils.PushToHubMixin.push_to_hub`] 함수는 각 컴포넌트를 리포지토리의 하위 폴더에 저장합니다. 이제 Hub의 리포지토리에서 파이프라인을 다시 불러올 수 있습니다: + +```py +pipeline = StableDiffusionPipeline.from_pretrained("your-namespace/my-pipeline") +``` + +## 비공개 + +모델, 스케줄러 또는 파이프라인 파일들을 비공개로 두려면 [`~diffusers.utils.PushToHubMixin.push_to_hub`] 함수에서 `private=True`를 설정하세요: + +```py +controlnet.push_to_hub("my-controlnet-model-private", private=True) +``` + +비공개 리포지토리는 본인만 볼 수 있으며 다른 사용자는 리포지토리를 복제할 수 없고 리포지토리가 검색 결과에 표시되지 않습니다. 사용자가 비공개 리포지토리의 URL을 가지고 있더라도 `404 - Sorry, we can't find the page you are looking for`라는 메시지가 표시됩니다. 비공개 리포지토리에서 모델을 로드하려면 [로그인](https://huggingface.co/docs/huggingface_hub/quick-start#login) 상태여야 합니다. \ No newline at end of file diff --git a/docs/source/ko/using-diffusers/reproducibility.md b/docs/source/ko/using-diffusers/reproducibility.md deleted file mode 100644 index cdb5fb84b92f..000000000000 --- a/docs/source/ko/using-diffusers/reproducibility.md +++ /dev/null @@ -1,201 +0,0 @@ - - -# 재현 가능한 파이프라인 생성하기 - -[[open-in-colab]] - -재현성은 테스트, 결과 재현, 그리고 [이미지 퀄리티 높이기](resuing_seeds)에서 중요합니다. -그러나 diffusion 모델의 무작위성은 매번 모델이 돌아갈 때마다 파이프라인이 다른 이미지를 생성할 수 있도록 하는 이유로 필요합니다. -플랫폼 간에 정확하게 동일한 결과를 얻을 수는 없지만, 특정 허용 범위 내에서 릴리스 및 플랫폼 간에 결과를 재현할 수는 있습니다. -그럼에도 diffusion 파이프라인과 체크포인트에 따라 허용 오차가 달라집니다. - -diffusion 모델에서 무작위성의 원천을 제어하거나 결정론적 알고리즘을 사용하는 방법을 이해하는 것이 중요한 이유입니다. - - - -💡 Pytorch의 [재현성에 대한 선언](https://pytorch.org/docs/stable/notes/randomness.html)를 꼭 읽어보길 추천합니다: - -> 완전하게 재현가능한 결과는 Pytorch 배포, 개별적인 커밋, 혹은 다른 플랫폼들에서 보장되지 않습니다. -> 또한, 결과는 CPU와 GPU 실행간에 심지어 같은 seed를 사용할 때도 재현 가능하지 않을 수 있습니다. 
- - - -## 무작위성 제어하기 - -추론에서, 파이프라인은 노이즈를 줄이기 위해 가우시안 노이즈를 생성하거나 스케줄링 단계에 노이즈를 더하는 등의 랜덤 샘플링 실행에 크게 의존합니다, - -[DDIMPipeline](https://huggingface.co/docs/diffusers/v0.18.0/en/api/pipelines/ddim#diffusers.DDIMPipeline)에서 두 추론 단계 이후의 텐서 값을 살펴보세요: - -```python -from diffusers import DDIMPipeline -import numpy as np - -model_id = "google/ddpm-cifar10-32" - -# 모델과 스케줄러를 불러오기 -ddim = DDIMPipeline.from_pretrained(model_id) - -# 두 개의 단계에 대해서 파이프라인을 실행하고 numpy tensor로 값을 반환하기 -image = ddim(num_inference_steps=2, output_type="np").images -print(np.abs(image).sum()) -``` - -위의 코드를 실행하면 하나의 값이 나오지만, 다시 실행하면 다른 값이 나옵니다. 무슨 일이 일어나고 있는 걸까요? - -파이프라인이 실행될 때마다, [torch.randn](https://pytorch.org/docs/stable/generated/torch.randn.html)은 -단계적으로 노이즈 제거되는 가우시안 노이즈가 생성하기 위한 다른 랜덤 seed를 사용합니다. - -그러나 동일한 이미지를 안정적으로 생성해야 하는 경우에는 CPU에서 파이프라인을 실행하는지 GPU에서 실행하는지에 따라 달라집니다. - -### CPU - -CPU에서 재현 가능한 결과를 생성하려면, PyTorch [Generator](https://pytorch.org/docs/stable/generated/torch.randn.html)로 seed를 고정합니다: - -```python -import torch -from diffusers import DDIMPipeline -import numpy as np - -model_id = "google/ddpm-cifar10-32" - -# 모델과 스케줄러 불러오기 -ddim = DDIMPipeline.from_pretrained(model_id) - -# 재현성을 위해 generator 만들기 -generator = torch.Generator(device="cpu").manual_seed(0) - -# 두 개의 단계에 대해서 파이프라인을 실행하고 numpy tensor로 값을 반환하기 -image = ddim(num_inference_steps=2, output_type="np", generator=generator).images -print(np.abs(image).sum()) -``` - -이제 위의 코드를 실행하면 seed를 가진 `Generator` 객체가 파이프라인의 모든 랜덤 함수에 전달되므로 항상 `1491.1711` 값이 출력됩니다. - -특정 하드웨어 및 PyTorch 버전에서 이 코드 예제를 실행하면 동일하지는 않더라도 유사한 결과를 얻을 수 있습니다. - - - -💡 처음에는 시드를 나타내는 정수값 대신에 `Generator` 개체를 파이프라인에 전달하는 것이 약간 비직관적일 수 있지만, -`Generator`는 순차적으로 여러 파이프라인에 전달될 수 있는 \랜덤상태\이기 때문에 PyTorch에서 확률론적 모델을 다룰 때 권장되는 설계입니다. - - - -### GPU - -예를 들면, GPU 상에서 같은 코드 예시를 실행하면: - -```python -import torch -from diffusers import DDIMPipeline -import numpy as np - -model_id = "google/ddpm-cifar10-32" - -# 모델과 스케줄러 불러오기 -ddim = DDIMPipeline.from_pretrained(model_id) -ddim.to("cuda") - -# 재현성을 위한 generator 만들기 -generator = torch.Generator(device="cuda").manual_seed(0) - -# 두 개의 단계에 대해서 파이프라인을 실행하고 numpy tensor로 값을 반환하기 -image = ddim(num_inference_steps=2, output_type="np", generator=generator).images -print(np.abs(image).sum()) -``` - -GPU가 CPU와 다른 난수 생성기를 사용하기 때문에 동일한 시드를 사용하더라도 결과가 같지 않습니다. - -이 문제를 피하기 위해 🧨 Diffusers는 CPU에 임의의 노이즈를 생성한 다음 필요에 따라 텐서를 GPU로 이동시키는 -[randn_tensor()](https://huggingface.co/docs/diffusers/v0.18.0/en/api/utilities#diffusers.utils.randn_tensor)기능을 가지고 있습니다. -`randn_tensor` 기능은 파이프라인 내부 어디에서나 사용되므로 파이프라인이 GPU에서 실행되더라도 **항상** CPU `Generator`를 통과할 수 있습니다. - -이제 결과에 훨씬 더 다가왔습니다! - -```python -import torch -from diffusers import DDIMPipeline -import numpy as np - -model_id = "google/ddpm-cifar10-32" - -# 모델과 스케줄러 불러오기 -ddim = DDIMPipeline.from_pretrained(model_id) -ddim.to("cuda") - -#재현성을 위한 generator 만들기 (GPU에 올리지 않도록 조심한다!) -generator = torch.manual_seed(0) - -# 두 개의 단계에 대해서 파이프라인을 실행하고 numpy tensor로 값을 반환하기 -image = ddim(num_inference_steps=2, output_type="np", generator=generator).images -print(np.abs(image).sum()) -``` - - - -💡 재현성이 중요한 경우에는 항상 CPU generator를 전달하는 것이 좋습니다. -성능 손실은 무시할 수 없는 경우가 많으며 파이프라인이 GPU에서 실행되었을 때보다 훨씬 더 비슷한 값을 생성할 수 있습니다. - - - -마지막으로 [UnCLIPPipeline](https://huggingface.co/docs/diffusers/v0.18.0/en/api/pipelines/unclip#diffusers.UnCLIPPipeline)과 같은 -더 복잡한 파이프라인의 경우, 이들은 종종 정밀 오차 전파에 극도로 취약합니다. -다른 GPU 하드웨어 또는 PyTorch 버전에서 유사한 결과를 기대하지 마세요. -이 경우 완전한 재현성을 위해 완전히 동일한 하드웨어 및 PyTorch 버전을 실행해야 합니다. 
- -## 결정론적 알고리즘 - -결정론적 알고리즘을 사용하여 재현 가능한 파이프라인을 생성하도록 PyTorch를 구성할 수도 있습니다. -그러나 결정론적 알고리즘은 비결정론적 알고리즘보다 느리고 성능이 저하될 수 있습니다. -하지만 재현성이 중요하다면, 이것이 최선의 방법입니다! - -둘 이상의 CUDA 스트림에서 작업이 시작될 때 비결정론적 동작이 발생합니다. -이 문제를 방지하려면 환경 변수 [CUBLAS_WORKSPACE_CONFIG](https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility)를 `:16:8`로 설정해서 -런타임 중에 오직 하나의 버퍼 크리만 사용하도록 설정합니다. - -PyTorch는 일반적으로 가장 빠른 알고리즘을 선택하기 위해 여러 알고리즘을 벤치마킹합니다. -하지만 재현성을 원하는 경우, 벤치마크가 매 순간 다른 알고리즘을 선택할 수 있기 때문에 이 기능을 사용하지 않도록 설정해야 합니다. -마지막으로, [torch.use_deterministic_algorithms](https://pytorch.org/docs/stable/generated/torch.use_deterministic_algorithms.html)에 -`True`를 통과시켜 결정론적 알고리즘이 활성화 되도록 합니다. - -```py -import os - -os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8" - -torch.backends.cudnn.benchmark = False -torch.use_deterministic_algorithms(True) -``` - -이제 동일한 파이프라인을 두번 실행하면 동일한 결과를 얻을 수 있습니다. - -```py -import torch -from diffusers import DDIMScheduler, StableDiffusionPipeline -import numpy as np - -model_id = "runwayml/stable-diffusion-v1-5" -pipe = StableDiffusionPipeline.from_pretrained(model_id).to("cuda") -pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) -g = torch.Generator(device="cuda") - -prompt = "A bear is playing a guitar on Times Square" - -g.manual_seed(0) -result1 = pipe(prompt=prompt, num_inference_steps=50, generator=g, output_type="latent").images - -g.manual_seed(0) -result2 = pipe(prompt=prompt, num_inference_steps=50, generator=g, output_type="latent").images - -print("L_inf dist = ", abs(result1 - result2).max()) -"L_inf dist = tensor(0., device='cuda:0')" -``` diff --git a/docs/source/ko/using-diffusers/reusing_seeds.md b/docs/source/ko/using-diffusers/reusing_seeds.md deleted file mode 100644 index f6afdf6535a5..000000000000 --- a/docs/source/ko/using-diffusers/reusing_seeds.md +++ /dev/null @@ -1,63 +0,0 @@ - - -# Deterministic(결정적) 생성을 통한 이미지 품질 개선 - -생성된 이미지의 품질을 개선하는 일반적인 방법은 *결정적 batch(배치) 생성*을 사용하는 것입니다. 이 방법은 이미지 batch(배치)를 생성하고 두 번째 추론 라운드에서 더 자세한 프롬프트와 함께 개선할 이미지 하나를 선택하는 것입니다. 핵심은 일괄 이미지 생성을 위해 파이프라인에 [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html#generator) 목록을 전달하고, 각 `Generator`를 시드에 연결하여 이미지에 재사용할 수 있도록 하는 것입니다. - -예를 들어 [`runwayml/stable-diffusion-v1-5`](runwayml/stable-diffusion-v1-5)를 사용하여 다음 프롬프트의 여러 버전을 생성해 봅시다. - -```py -prompt = "Labrador in the style of Vermeer" -``` - -(가능하다면) 파이프라인을 [`DiffusionPipeline.from_pretrained`]로 인스턴스화하여 GPU에 배치합니다. - -```python ->>> from diffusers import DiffusionPipeline - ->>> pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16) ->>> pipe = pipe.to("cuda") -``` - -이제 네 개의 서로 다른 `Generator`를 정의하고 각 `Generator`에 시드(`0` ~ `3`)를 할당하여 나중에 특정 이미지에 대해 `Generator`를 재사용할 수 있도록 합니다. - -```python ->>> import torch - ->>> generator = [torch.Generator(device="cuda").manual_seed(i) for i in range(4)] -``` - -이미지를 생성하고 살펴봅니다. - -```python ->>> images = pipe(prompt, generator=generator, num_images_per_prompt=4).images ->>> images -``` - -![img](https://huggingface.co/datasets/diffusers/diffusers-images-docs/resolve/main/reusabe_seeds.jpg) - -이 예제에서는 첫 번째 이미지를 개선했지만 실제로는 원하는 모든 이미지를 사용할 수 있습니다(심지어 두 개의 눈이 있는 이미지도!). 첫 번째 이미지에서는 시드가 '0'인 '생성기'를 사용했기 때문에 두 번째 추론 라운드에서는 이 '생성기'를 재사용할 것입니다. 
이미지의 품질을 개선하려면 프롬프트에 몇 가지 텍스트를 추가합니다: - -```python -prompt = [prompt + t for t in [", highly realistic", ", artsy", ", trending", ", colorful"]] -generator = [torch.Generator(device="cuda").manual_seed(0) for i in range(4)] -``` - -시드가 `0`인 제너레이터 4개를 생성하고, 이전 라운드의 첫 번째 이미지처럼 보이는 다른 이미지 batch(배치)를 생성합니다! - -```python ->>> images = pipe(prompt, generator=generator).images ->>> images -``` - -![img](https://huggingface.co/datasets/diffusers/diffusers-images-docs/resolve/main/reusabe_seeds_2.jpg) diff --git a/docs/source/ko/using-diffusers/sdxl_turbo.md b/docs/source/ko/using-diffusers/sdxl_turbo.md new file mode 100644 index 000000000000..766ac0f10a7d --- /dev/null +++ b/docs/source/ko/using-diffusers/sdxl_turbo.md @@ -0,0 +1,114 @@ + + +# Stable Diffusion XL Turbo + +[[open-in-colab]] + +SDXL Turbo는 adversarial time-distilled(적대적 시간 전이) [Stable Diffusion XL](https://huggingface.co/papers/2307.01952)(SDXL) 모델로, 단 한 번의 스텝만으로 추론을 실행할 수 있습니다. + +이 가이드에서는 text-to-image와 image-to-image를 위한 SDXL-Turbo를 사용하는 방법을 설명합니다. + +시작하기 전에 다음 라이브러리가 설치되어 있는지 확인하세요: + +```py +# Colab에서 필요한 라이브러리를 설치하기 위해 주석을 제외하세요 +#!pip install -q diffusers transformers accelerate +``` + +## 모델 체크포인트 불러오기 + +모델 가중치는 Hub의 별도 하위 폴더 또는 로컬에 저장할 수 있으며, 이 경우 [`~StableDiffusionXLPipeline.from_pretrained`] 메서드를 사용해야 합니다: + +```py +from diffusers import AutoPipelineForText2Image, AutoPipelineForImage2Image +import torch + +pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/sdxl-turbo", torch_dtype=torch.float16, variant="fp16") +pipeline = pipeline.to("cuda") +``` + +또한 [`~StableDiffusionXLPipeline.from_single_file`] 메서드를 사용하여 허브 또는 로컬에서 단일 파일 형식(`.ckpt` 또는 `.safetensors`)으로 저장된 모델 체크포인트를 불러올 수도 있습니다: + +```py +from diffusers import StableDiffusionXLPipeline +import torch + +pipeline = StableDiffusionXLPipeline.from_single_file( + "https://huggingface.co/stabilityai/sdxl-turbo/blob/main/sd_xl_turbo_1.0_fp16.safetensors", torch_dtype=torch.float16) +pipeline = pipeline.to("cuda") +``` + +## Text-to-image + +Text-to-image의 경우 텍스트 프롬프트를 전달합니다. 기본적으로 SDXL Turbo는 512x512 이미지를 생성하며, 이 해상도에서 최상의 결과를 제공합니다. `height` 및 `width` 매개 변수를 768x768 또는 1024x1024로 설정할 수 있지만 이 경우 품질 저하를 예상할 수 있습니다. + +모델이 `guidance_scale` 없이 학습되었으므로 이를 0.0으로 설정해 비활성화해야 합니다. 단일 추론 스텝만으로도 고품질 이미지를 생성할 수 있습니다. +스텝 수를 2, 3 또는 4로 늘리면 이미지 품질이 향상됩니다. + +```py +from diffusers import AutoPipelineForText2Image +import torch + +pipeline_text2image = AutoPipelineForText2Image.from_pretrained("stabilityai/sdxl-turbo", torch_dtype=torch.float16, variant="fp16") +pipeline_text2image = pipeline_text2image.to("cuda") + +prompt = "A cinematic shot of a baby racoon wearing an intricate italian priest robe." + +image = pipeline_text2image(prompt=prompt, guidance_scale=0.0, num_inference_steps=1).images[0] +image +``` + +
+ generated image of a racoon in a robe +
+ +## Image-to-image + +Image-to-image 생성의 경우 `num_inference_steps * strength`가 1보다 크거나 같은지 확인하세요. +Image-to-image 파이프라인은 아래 예제에서 `0.5 * 2.0 = 1` 스텝과 같이 `int(num_inference_steps * strength)` 스텝으로 실행됩니다. + +```py +from diffusers import AutoPipelineForImage2Image +from diffusers.utils import load_image, make_image_grid + +# 체크포인트를 불러올 때 추가 메모리 소모를 피하려면 from_pipe를 사용하세요. +pipeline = AutoPipelineForImage2Image.from_pipe(pipeline_text2image).to("cuda") + +init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png") +init_image = init_image.resize((512, 512)) + +prompt = "cat wizard, gandalf, lord of the rings, detailed, fantasy, cute, adorable, Pixar, Disney, 8k" + +image = pipeline(prompt, image=init_image, strength=0.5, guidance_scale=0.0, num_inference_steps=2).images[0] +make_image_grid([init_image, image], rows=1, cols=2) +``` + +
+ Image-to-image generation sample using SDXL Turbo +
+ +## SDXL Turbo 속도 훨씬 더 빠르게 하기 + +- PyTorch 버전 2 이상을 사용하는 경우 UNet을 컴파일합니다. 첫 번째 추론 실행은 매우 느리지만 이후 실행은 훨씬 빨라집니다. + +```py +pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) +``` + +- 기본 VAE를 사용하는 경우, 각 생성 전후에 비용이 많이 드는 `dtype` 변환을 피하기 위해 `float32`로 유지하세요. 이 작업은 첫 생성 이전에 한 번만 수행하면 됩니다: + +```py +pipe.upcast_vae() +``` + +또는, 커뮤니티 회원인 [`@madebyollin`](https://huggingface.co/madebyollin)이 만든 [16비트 VAE](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix)를 사용할 수도 있으며, 이는 `float32`로 업캐스트할 필요가 없습니다. \ No newline at end of file diff --git a/docs/source/ko/using-diffusers/shap-e.md b/docs/source/ko/using-diffusers/shap-e.md new file mode 100644 index 000000000000..abf5a182b3a6 --- /dev/null +++ b/docs/source/ko/using-diffusers/shap-e.md @@ -0,0 +1,192 @@ + + +# Shap-E + +[[open-in-colab]] + +Shap-E는 비디오 게임 개발, 인테리어 디자인, 건축에 사용할 수 있는 3D 에셋을 생성하기 위한 conditional 모델입니다. 대규모 3D 에셋 데이터셋을 학습되었고, 각 오브젝트의 더 많은 뷰를 렌더링하고 4K point cloud 대신 16K를 생성하도록 후처리합니다. Shap-E 모델은 두 단계로 학습됩니다: + +1. 인코더가 3D 에셋의 포인트 클라우드와 렌더링된 뷰를 받아들이고 에셋을 나타내는 implicit functions의 파라미터를 출력합니다. +2. 인코더가 생성한 latents를 바탕으로 diffusion 모델을 훈련하여 neural radiance fields(NeRF) 또는 textured 3D 메시를 생성하여 다운스트림 애플리케이션에서 3D 에셋을 더 쉽게 렌더링하고 사용할 수 있도록 합니다. + +이 가이드에서는 Shap-E를 사용하여 나만의 3D 에셋을 생성하는 방법을 보입니다! + +시작하기 전에 다음 라이브러리가 설치되어 있는지 확인하세요: + +```py +# Colab에서 필요한 라이브러리를 설치하기 위해 주석을 제외하세요 +#!pip install -q diffusers transformers accelerate trimesh +``` + +## Text-to-3D + +3D 객체의 gif를 생성하려면 텍스트 프롬프트를 [`ShapEPipeline`]에 전달합니다. 파이프라인은 3D 객체를 생성하는 데 사용되는 이미지 프레임 리스트를 생성합니다. + +```py +import torch +from diffusers import ShapEPipeline + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +pipe = ShapEPipeline.from_pretrained("openai/shap-e", torch_dtype=torch.float16, variant="fp16") +pipe = pipe.to(device) + +guidance_scale = 15.0 +prompt = ["A firecracker", "A birthday cupcake"] + +images = pipe( + prompt, + guidance_scale=guidance_scale, + num_inference_steps=64, + frame_size=256, +).images +``` + +이제 [`~utils.export_to_gif`] 함수를 사용하여 이미지 프레임 리스트를 3D 객체의 gif로 변환합니다. + +```py +from diffusers.utils import export_to_gif + +export_to_gif(images[0], "firecracker_3d.gif") +export_to_gif(images[1], "cake_3d.gif") +``` + +
+prompt = "A firecracker"
+prompt = "A birthday cupcake"
+ +## Image-to-3D + +다른 이미지로부터 3D 개체를 생성하려면 [`ShapEImg2ImgPipeline`]을 사용합니다. 기존 이미지를 사용하거나 완전히 새로운 이미지를 생성할 수 있습니다. [Kandinsky 2.1](../api/pipelines/kandinsky) 모델을 사용하여 새 이미지를 생성해 보겠습니다. + +```py +from diffusers import DiffusionPipeline +import torch + +prior_pipeline = DiffusionPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16, use_safetensors=True).to("cuda") +pipeline = DiffusionPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16, use_safetensors=True).to("cuda") + +prompt = "A cheeseburger, white background" + +image_embeds, negative_image_embeds = prior_pipeline(prompt, guidance_scale=1.0).to_tuple() +image = pipeline( + prompt, + image_embeds=image_embeds, + negative_image_embeds=negative_image_embeds, +).images[0] + +image.save("burger.png") +``` + +치즈버거를 [`ShapEImg2ImgPipeline`]에 전달하여 3D representation을 생성합니다. + +```py +from PIL import Image +from diffusers import ShapEImg2ImgPipeline +from diffusers.utils import export_to_gif + +pipe = ShapEImg2ImgPipeline.from_pretrained("openai/shap-e-img2img", torch_dtype=torch.float16, variant="fp16").to("cuda") + +guidance_scale = 3.0 +image = Image.open("burger.png").resize((256, 256)) + +images = pipe( + image, + guidance_scale=guidance_scale, + num_inference_steps=64, + frame_size=256, +).images + +gif_path = export_to_gif(images[0], "burger_3d.gif") +``` + +
+cheeseburger
+3D cheeseburger
+ +## 메시 생성하기 + +Shap-E는 다운스트림 애플리케이션에 렌더링할 textured 메시 출력을 생성할 수도 있는 유연한 모델입니다. 이 예제에서는 🤗 Datasets 라이브러리에서 [Dataset viewer](https://huggingface.co/docs/hub/datasets-viewer#dataset-preview)를 사용해 메시 시각화를 지원하는 `glb` 파일로 변환합니다. + +`output_type` 매개변수를 `"mesh"`로 지정함으로써 [`ShapEPipeline`]과 [`ShapEImg2ImgPipeline`] 모두에 대한 메시 출력을 생성할 수 있습니다: + +```py +import torch +from diffusers import ShapEPipeline + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +pipe = ShapEPipeline.from_pretrained("openai/shap-e", torch_dtype=torch.float16, variant="fp16") +pipe = pipe.to(device) + +guidance_scale = 15.0 +prompt = "A birthday cupcake" + +images = pipe(prompt, guidance_scale=guidance_scale, num_inference_steps=64, frame_size=256, output_type="mesh").images +``` + +메시 출력을 `ply` 파일로 저장하려면 [`~utils.export_to_ply`] 함수를 사용합니다: + + + +선택적으로 [`~utils.export_to_obj`] 함수를 사용하여 메시 출력을 `obj` 파일로 저장할 수 있습니다. 다양한 형식으로 메시 출력을 저장할 수 있어 다운스트림에서 더욱 유연하게 사용할 수 있습니다! + + + +```py +from diffusers.utils import export_to_ply + +ply_path = export_to_ply(images[0], "3d_cake.ply") +print(f"Saved to folder: {ply_path}") +``` + +그 다음 trimesh 라이브러리를 사용하여 `ply` 파일을 `glb` 파일로 변환할 수 있습니다: + +```py +import trimesh + +mesh = trimesh.load("3d_cake.ply") +mesh_export = mesh.export("3d_cake.glb", file_type="glb") +``` + +기본적으로 메시 출력은 아래쪽 시점에 초점이 맞춰져 있지만 회전 변환을 적용하여 기본 시점을 변경할 수 있습니다: + +```py +import trimesh +import numpy as np + +mesh = trimesh.load("3d_cake.ply") +rot = trimesh.transformations.rotation_matrix(-np.pi / 2, [1, 0, 0]) +mesh = mesh.apply_transform(rot) +mesh_export = mesh.export("3d_cake.glb", file_type="glb") +``` + +메시 파일을 데이터셋 레포지토리에 업로드해 Dataset viewer로 시각화하세요! + +
+ +
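+
+아래는 변환한 `3d_cake.glb` 파일을 Hub의 데이터셋 레포지토리에 업로드하는 간단한 스케치 예시입니다. `huggingface_hub` 라이브러리가 설치되어 있고 쓰기 권한이 있는 토큰으로 로그인되어 있다고 가정하며, `your-username/3d-assets`는 설명을 위한 가상의 레포지토리 이름입니다:
+
+```py
+# 가정: `pip install huggingface_hub` 후 `huggingface-cli login`으로 로그인한 상태입니다.
+from huggingface_hub import HfApi
+
+api = HfApi()
+
+# 가상의 예시 레포지토리 이름입니다. 실제 사용자명/레포지토리 이름으로 바꾸세요.
+repo_id = "your-username/3d-assets"
+api.create_repo(repo_id, repo_type="dataset", exist_ok=True)
+
+# 변환한 glb 파일을 업로드하면 Dataset viewer에서 시각화할 수 있습니다.
+api.upload_file(
+    path_or_fileobj="3d_cake.glb",
+    path_in_repo="3d_cake.glb",
+    repo_id=repo_id,
+    repo_type="dataset",
+)
+```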
diff --git a/docs/source/ko/using-diffusers/svd.md b/docs/source/ko/using-diffusers/svd.md new file mode 100644 index 000000000000..678e21728ad4 --- /dev/null +++ b/docs/source/ko/using-diffusers/svd.md @@ -0,0 +1,121 @@ + + +# Stable Video Diffusion + +[[open-in-colab]] + +[Stable Video Diffusion (SVD)](https://huggingface.co/papers/2311.15127)은 입력 이미지에 맞춰 2~4초 분량의 고해상도(576x1024) 비디오를 생성할 수 있는 강력한 image-to-video 생성 모델입니다. + +이 가이드에서는 SVD를 사용하여 이미지에서 짧은 동영상을 생성하는 방법을 설명합니다. + +시작하기 전에 다음 라이브러리가 설치되어 있는지 확인하세요: + +```py +!pip install -q -U diffusers transformers accelerate +``` + +이 모델에는 [SVD](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid)와 [SVD-XT](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt) 두 가지 종류가 있습니다. SVD 체크포인트는 14개의 프레임을 생성하도록 학습되었고, SVD-XT 체크포인트는 25개의 프레임을 생성하도록 파인튜닝되었습니다. + +이 가이드에서는 SVD-XT 체크포인트를 사용합니다. + +```python +import torch + +from diffusers import StableVideoDiffusionPipeline +from diffusers.utils import load_image, export_to_video + +pipe = StableVideoDiffusionPipeline.from_pretrained( + "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16" +) +pipe.enable_model_cpu_offload() + +# Conditioning 이미지 불러오기 +image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket.png") +image = image.resize((1024, 576)) + +generator = torch.manual_seed(42) +frames = pipe(image, decode_chunk_size=8, generator=generator).frames[0] + +export_to_video(frames, "generated.mp4", fps=7) +``` + +
+"source image of a rocket"
+"generated video from source image"
+
+## torch.compile
+
+UNet을 [컴파일](../optimization/torch2.0#torchcompile)하면 메모리 사용량이 살짝 증가하지만, 20~25%의 속도 향상을 얻을 수 있습니다.
+
+```diff
+- pipe.enable_model_cpu_offload()
++ pipe.to("cuda")
++ pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+```
+
+## 메모리 사용량 줄이기
+
+비디오 생성은 기본적으로 배치 크기가 큰 text-to-image 생성과 유사하게 `num_frames`를 한 번에 생성하기 때문에 메모리 사용량이 매우 높습니다. 메모리 사용량을 줄이기 위해 추론 속도와 메모리 사용량을 절충하는 여러 가지 옵션이 있습니다:
+
+- 모델 오프로딩 활성화: 파이프라인의 각 구성 요소가 더 이상 필요하지 않을 때 CPU로 오프로드됩니다.
+- Feed-forward chunking 활성화: feed-forward 레이어가 배치 크기가 큰 단일 feed-forward를 실행하는 대신 루프로 반복해서 실행됩니다.
+- `decode_chunk_size` 감소: VAE가 프레임들을 한꺼번에 디코딩하는 대신 chunk 단위로 디코딩합니다. `decode_chunk_size=1`을 설정하면 한 번에 한 프레임씩 디코딩하고 최소한의 메모리만 사용하지만(GPU 메모리에 따라 이 값을 조정하는 것이 좋습니다), 동영상에 약간의 깜박임이 발생할 수 있습니다.
+
+```diff
+- pipe.enable_model_cpu_offload()
+- frames = pipe(image, decode_chunk_size=8, generator=generator).frames[0]
++ pipe.enable_model_cpu_offload()
++ pipe.unet.enable_forward_chunking()
++ frames = pipe(image, decode_chunk_size=2, generator=generator, num_frames=25).frames[0]
+```
+
+이러한 모든 방법들을 사용하면 메모리 사용량이 8GB VRAM보다 적을 것입니다.
+
+## Micro-conditioning
+
+Stable Video Diffusion은 또한 이미지 conditioning 외에도 micro-conditioning을 허용하므로 생성된 비디오를 더 잘 제어할 수 있습니다:
+
+- `fps`: 생성된 비디오의 초당 프레임 수입니다.
+- `motion_bucket_id`: 생성된 동영상에 사용할 모션 버킷 아이디입니다. 생성된 동영상의 모션을 제어하는 데 사용할 수 있습니다. 모션 버킷 아이디를 늘리면 생성되는 동영상의 모션이 증가합니다.
+- `noise_aug_strength`: Conditioning 이미지에 추가되는 노이즈의 양입니다. 값이 클수록 비디오가 conditioning 이미지와 덜 유사해집니다. 이 값을 높이면 생성된 비디오의 움직임도 증가합니다.
+
+예를 들어, 모션이 더 많은 동영상을 생성하려면 `motion_bucket_id` 및 `noise_aug_strength` micro-conditioning 파라미터를 사용합니다:
+
+```python
+import torch
+
+from diffusers import StableVideoDiffusionPipeline
+from diffusers.utils import load_image, export_to_video
+
+pipe = StableVideoDiffusionPipeline.from_pretrained(
+  "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16"
+)
+pipe.enable_model_cpu_offload()
+
+# Conditioning 이미지 불러오기
+image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket.png")
+image = image.resize((1024, 576))
+
+generator = torch.manual_seed(42)
+frames = pipe(image, decode_chunk_size=8, generator=generator, motion_bucket_id=180, noise_aug_strength=0.1).frames[0]
+export_to_video(frames, "generated.mp4", fps=7)
+```
+
+![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/output_rocket_with_conditions.gif)
diff --git a/docs/source/ko/using-diffusers/using_safetensors.md b/docs/source/ko/using-diffusers/using_safetensors.md
deleted file mode 100644
index 4e1c6758e13f..000000000000
--- a/docs/source/ko/using-diffusers/using_safetensors.md
+++ /dev/null
@@ -1,67 +0,0 @@
-# 세이프텐서 로드
-
-[safetensors](https://github.com/huggingface/safetensors)는 텐서를 저장하고 로드하기 위한 안전하고 빠른 파일 형식입니다. 일반적으로 PyTorch 모델 가중치는 Python의 [`pickle`](https://docs.python.org/3/library/pickle.html) 유틸리티를 사용하여 `.bin` 파일에 저장되거나 `피클`됩니다. 그러나 `피클`은 안전하지 않으며 피클된 파일에는 실행될 수 있는 악성 코드가 포함될 수 있습니다. 세이프텐서는 `피클`의 안전한 대안으로 모델 가중치를 공유하는 데 이상적입니다.
-
-이 가이드에서는 `.safetensor` 파일을 로드하는 방법과 다른 형식으로 저장된 안정적 확산 모델 가중치를 `.safetensor`로 변환하는 방법을 보여드리겠습니다. 시작하기 전에 세이프텐서가 설치되어 있는지 확인하세요:
-
-```bash
-!pip install safetensors
-```
-
-['runwayml/stable-diffusion-v1-5`] (https://huggingface.co/runwayml/stable-diffusion-v1-5/tree/main) 리포지토리를 보면 `text_encoder`, `unet` 및 `vae` 하위 폴더에 가중치가 `.safetensors` 형식으로 저장되어 있는 것을 볼 수 있습니다. 기본적으로 🤗 디퓨저는 모델 저장소에서 사용할 수 있는 경우 해당 하위 폴더에서 이러한 '.safetensors` 파일을 자동으로 로드합니다.
- -보다 명시적인 제어를 위해 선택적으로 `사용_세이프텐서=True`를 설정할 수 있습니다(`세이프텐서`가 설치되지 않은 경우 설치하라는 오류 메시지가 표시됨): - -```py -from diffusers import DiffusionPipeline - -pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_safetensors=True) -``` - -그러나 모델 가중치가 위의 예시처럼 반드시 별도의 하위 폴더에 저장되는 것은 아닙니다. 모든 가중치가 하나의 '.safetensors` 파일에 저장되는 경우도 있습니다. 이 경우 가중치가 Stable Diffusion 가중치인 경우 [`~diffusers.loaders.FromCkptMixin.from_ckpt`] 메서드를 사용하여 파일을 직접 로드할 수 있습니다: - -```py -from diffusers import StableDiffusionPipeline - -pipeline = StableDiffusionPipeline.from_ckpt( - "https://huggingface.co/WarriorMama777/OrangeMixs/blob/main/Models/AbyssOrangeMix/AbyssOrangeMix.safetensors" -) -``` - -## 세이프텐서로 변환 - -허브의 모든 가중치를 '.safetensors` 형식으로 사용할 수 있는 것은 아니며, '.bin`으로 저장된 가중치가 있을 수 있습니다. 이 경우 [Convert Space](https://huggingface.co/spaces/diffusers/convert)을 사용하여 가중치를 '.safetensors'로 변환하세요. Convert Space는 피클된 가중치를 다운로드하여 변환한 후 풀 리퀘스트를 열어 허브에 새로 변환된 `.safetensors` 파일을 업로드합니다. 이렇게 하면 피클된 파일에 악성 코드가 포함되어 있는 경우, 안전하지 않은 파일과 의심스러운 피클 가져오기를 탐지하는 [보안 스캐너](https://huggingface.co/docs/hub/security-pickle#hubs-security-scanner)가 있는 허브로 업로드됩니다. - 개별 컴퓨터가 아닌. - -개정` 매개변수에 풀 리퀘스트에 대한 참조를 지정하여 새로운 '.safetensors` 가중치가 적용된 모델을 사용할 수 있습니다(허브의 [Check PR](https://huggingface.co/spaces/diffusers/check_pr) 공간에서 테스트할 수도 있음)(예: `refs/pr/22`): - -```py -from diffusers import DiffusionPipeline - -pipeline = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1", revision="refs/pr/22") -``` - -## 세이프센서를 사용하는 이유는 무엇인가요? - -세이프티 센서를 사용하는 데에는 여러 가지 이유가 있습니다: - -- 세이프텐서를 사용하는 가장 큰 이유는 안전입니다.오픈 소스 및 모델 배포가 증가함에 따라 다운로드한 모델 가중치에 악성 코드가 포함되어 있지 않다는 것을 신뢰할 수 있는 것이 중요해졌습니다.세이프센서의 현재 헤더 크기는 매우 큰 JSON 파일을 구문 분석하지 못하게 합니다. -- 모델 전환 간의 로딩 속도는 텐서의 제로 카피를 수행하는 세이프텐서를 사용해야 하는 또 다른 이유입니다. 가중치를 CPU(기본값)로 로드하는 경우 '피클'에 비해 특히 빠르며, 가중치를 GPU로 직접 로드하는 경우에도 빠르지는 않더라도 비슷하게 빠릅니다. 모델이 이미 로드된 경우에만 성능 차이를 느낄 수 있으며, 가중치를 다운로드하거나 모델을 처음 로드하는 경우에는 성능 차이를 느끼지 못할 것입니다. - - 전체 파이프라인을 로드하는 데 걸리는 시간입니다: - - ```py - from diffusers import StableDiffusionPipeline - - pipeline = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1") - "Loaded in safetensors 0:00:02.033658" - "Loaded in PyTorch 0:00:02.663379" - ``` - - 하지만 실제로 500MB의 모델 가중치를 로드하는 데 걸리는 시간은 얼마 되지 않습니다: - - ```bash - safetensors: 3.4873ms - PyTorch: 172.7537ms - ``` - -지연 로딩은 세이프텐서에서도 지원되며, 이는 분산 설정에서 일부 텐서만 로드하는 데 유용합니다. 이 형식을 사용하면 [BLOOM](https://huggingface.co/bigscience/bloom) 모델을 일반 PyTorch 가중치를 사용하여 10분이 걸리던 것을 8개의 GPU에서 45초 만에 로드할 수 있습니다. 
diff --git a/examples/advanced_diffusion_training/train_dreambooth_lora_sd15_advanced.py b/examples/advanced_diffusion_training/train_dreambooth_lora_sd15_advanced.py index cf558f082018..0f207a388356 100644 --- a/examples/advanced_diffusion_training/train_dreambooth_lora_sd15_advanced.py +++ b/examples/advanced_diffusion_training/train_dreambooth_lora_sd15_advanced.py @@ -57,7 +57,7 @@ StableDiffusionPipeline, UNet2DConditionModel, ) -from diffusers.loaders import LoraLoaderMixin +from diffusers.loaders import StableDiffusionLoraLoaderMixin from diffusers.optimization import get_scheduler from diffusers.training_utils import compute_snr from diffusers.utils import ( @@ -1302,7 +1302,7 @@ def save_model_hook(models, weights, output_dir): text_encoder_lora_layers=text_encoder_one_lora_layers_to_save, ) if args.train_text_encoder_ti: - embedding_handler.save_embeddings(f"{output_dir}/{args.output_dir}_emb.safetensors") + embedding_handler.save_embeddings(f"{args.output_dir}/{Path(args.output_dir).name}_emb.safetensors") def load_model_hook(models, input_dir): unet_ = None @@ -1318,11 +1318,11 @@ def load_model_hook(models, input_dir): else: raise ValueError(f"unexpected save model: {model.__class__}") - lora_state_dict, network_alphas = LoraLoaderMixin.lora_state_dict(input_dir) - LoraLoaderMixin.load_lora_into_unet(lora_state_dict, network_alphas=network_alphas, unet=unet_) + lora_state_dict, network_alphas = StableDiffusionLoraLoaderMixin.lora_state_dict(input_dir) + StableDiffusionLoraLoaderMixin.load_lora_into_unet(lora_state_dict, network_alphas=network_alphas, unet=unet_) text_encoder_state_dict = {k: v for k, v in lora_state_dict.items() if "text_encoder." in k} - LoraLoaderMixin.load_lora_into_text_encoder( + StableDiffusionLoraLoaderMixin.load_lora_into_text_encoder( text_encoder_state_dict, network_alphas=network_alphas, text_encoder=text_encoder_one_ ) diff --git a/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py b/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py index 9d06ce6cba16..a11a8afccbc6 100644 --- a/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py +++ b/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py @@ -60,7 +60,7 @@ StableDiffusionXLPipeline, UNet2DConditionModel, ) -from diffusers.loaders import LoraLoaderMixin +from diffusers.loaders import StableDiffusionLoraLoaderMixin from diffusers.optimization import get_scheduler from diffusers.training_utils import _set_state_dict_into_text_encoder, cast_training_params, compute_snr from diffusers.utils import ( @@ -1605,13 +1605,15 @@ def save_model_hook(models, weights, output_dir): if isinstance(model, type(unwrap_model(unet))): unet_lora_layers_to_save = convert_state_dict_to_diffusers(get_peft_model_state_dict(model)) elif isinstance(model, type(unwrap_model(text_encoder_one))): - text_encoder_one_lora_layers_to_save = convert_state_dict_to_diffusers( - get_peft_model_state_dict(model) - ) + if args.train_text_encoder: + text_encoder_one_lora_layers_to_save = convert_state_dict_to_diffusers( + get_peft_model_state_dict(model) + ) elif isinstance(model, type(unwrap_model(text_encoder_two))): - text_encoder_two_lora_layers_to_save = convert_state_dict_to_diffusers( - get_peft_model_state_dict(model) - ) + if args.train_text_encoder: + text_encoder_two_lora_layers_to_save = convert_state_dict_to_diffusers( + get_peft_model_state_dict(model) + ) else: raise ValueError(f"unexpected save model: {model.__class__}") @@ 
-1625,7 +1627,7 @@ def save_model_hook(models, weights, output_dir): text_encoder_2_lora_layers=text_encoder_two_lora_layers_to_save, ) if args.train_text_encoder_ti: - embedding_handler.save_embeddings(f"{output_dir}/{args.output_dir}_emb.safetensors") + embedding_handler.save_embeddings(f"{args.output_dir}/{Path(args.output_dir).name}_emb.safetensors") def load_model_hook(models, input_dir): unet_ = None @@ -1644,7 +1646,7 @@ def load_model_hook(models, input_dir): else: raise ValueError(f"unexpected save model: {model.__class__}") - lora_state_dict, network_alphas = LoraLoaderMixin.lora_state_dict(input_dir) + lora_state_dict, network_alphas = StableDiffusionLoraLoaderMixin.lora_state_dict(input_dir) unet_state_dict = {f'{k.replace("unet.", "")}': v for k, v in lora_state_dict.items() if k.startswith("unet.")} unet_state_dict = convert_unet_state_dict_to_peft(unet_state_dict) diff --git a/examples/amused/train_amused.py b/examples/amused/train_amused.py index 3ec0503dfdfe..ede51775dd8f 100644 --- a/examples/amused/train_amused.py +++ b/examples/amused/train_amused.py @@ -41,7 +41,7 @@ import diffusers.optimization from diffusers import AmusedPipeline, AmusedScheduler, EMAModel, UVit2DModel, VQModel -from diffusers.loaders import LoraLoaderMixin +from diffusers.loaders import AmusedLoraLoaderMixin from diffusers.utils import is_wandb_available @@ -532,7 +532,7 @@ def save_model_hook(models, weights, output_dir): weights.pop() if transformer_lora_layers_to_save is not None or text_encoder_lora_layers_to_save is not None: - LoraLoaderMixin.save_lora_weights( + AmusedLoraLoaderMixin.save_lora_weights( output_dir, transformer_lora_layers=transformer_lora_layers_to_save, text_encoder_lora_layers=text_encoder_lora_layers_to_save, @@ -566,11 +566,11 @@ def load_model_hook(models, input_dir): raise ValueError(f"unexpected save model: {model.__class__}") if transformer is not None or text_encoder_ is not None: - lora_state_dict, network_alphas = LoraLoaderMixin.lora_state_dict(input_dir) - LoraLoaderMixin.load_lora_into_text_encoder( + lora_state_dict, network_alphas = AmusedLoraLoaderMixin.lora_state_dict(input_dir) + AmusedLoraLoaderMixin.load_lora_into_text_encoder( lora_state_dict, network_alphas=network_alphas, text_encoder=text_encoder_ ) - LoraLoaderMixin.load_lora_into_transformer( + AmusedLoraLoaderMixin.load_lora_into_transformer( lora_state_dict, network_alphas=network_alphas, transformer=transformer ) diff --git a/examples/community/README.md b/examples/community/README.md index f467ee38de3b..652d65f900fe 100755 --- a/examples/community/README.md +++ b/examples/community/README.md @@ -1641,18 +1641,18 @@ from io import BytesIO from PIL import Image import torch from diffusers import DDIMScheduler -from diffusers.pipelines.stable_diffusion import StableDiffusionImg2ImgPipeline +from diffusers import DiffusionPipeline # Use the DDIMScheduler scheduler here instead scheduler = DDIMScheduler.from_pretrained("stabilityai/stable-diffusion-2-1", subfolder="scheduler") -pipe = StableDiffusionImg2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-2-1", - custom_pipeline="stable_diffusion_tensorrt_img2img", - variant='fp16', - torch_dtype=torch.float16, - scheduler=scheduler,) +pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1", + custom_pipeline="stable_diffusion_tensorrt_img2img", + variant='fp16', + torch_dtype=torch.float16, + scheduler=scheduler,) # re-use cached folder to save ONNX models and TensorRT Engines 
pipe.set_cached_folder("stabilityai/stable-diffusion-2-1", variant='fp16',) diff --git a/examples/community/checkpoint_merger.py b/examples/community/checkpoint_merger.py index f702bf0cea9b..6ba4b8c6e837 100644 --- a/examples/community/checkpoint_merger.py +++ b/examples/community/checkpoint_merger.py @@ -71,7 +71,7 @@ def merge(self, pretrained_model_name_or_path_list: List[Union[str, os.PathLike] **kwargs: Supports all the default DiffusionPipeline.get_config_dict kwargs viz.. - cache_dir, resume_download, force_download, proxies, local_files_only, token, revision, torch_dtype, device_map. + cache_dir, force_download, proxies, local_files_only, token, revision, torch_dtype, device_map. alpha - The interpolation parameter. Ranges from 0 to 1. It affects the ratio in which the checkpoints are merged. A 0.8 alpha would mean that the first model checkpoints would affect the final result far less than an alpha of 0.2 @@ -86,7 +86,6 @@ def merge(self, pretrained_model_name_or_path_list: List[Union[str, os.PathLike] """ # Default kwargs from DiffusionPipeline cache_dir = kwargs.pop("cache_dir", None) - resume_download = kwargs.pop("resume_download", False) force_download = kwargs.pop("force_download", False) proxies = kwargs.pop("proxies", None) local_files_only = kwargs.pop("local_files_only", False) @@ -124,7 +123,6 @@ def merge(self, pretrained_model_name_or_path_list: List[Union[str, os.PathLike] config_dict = DiffusionPipeline.load_config( pretrained_model_name_or_path, cache_dir=cache_dir, - resume_download=resume_download, force_download=force_download, proxies=proxies, local_files_only=local_files_only, @@ -160,7 +158,6 @@ def merge(self, pretrained_model_name_or_path_list: List[Union[str, os.PathLike] else snapshot_download( pretrained_model_name_or_path, cache_dir=cache_dir, - resume_download=resume_download, proxies=proxies, local_files_only=local_files_only, token=token, diff --git a/examples/community/fresco_v2v.py b/examples/community/fresco_v2v.py index 1dd34bfcaa98..77071b023106 100644 --- a/examples/community/fresco_v2v.py +++ b/examples/community/fresco_v2v.py @@ -26,7 +26,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection from diffusers.image_processor import PipelineImageInput, VaeImageProcessor -from diffusers.loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from diffusers.loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from diffusers.models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel from diffusers.models.attention_processor import AttnProcessor2_0 from diffusers.models.lora import adjust_lora_scale_text_encoder @@ -1252,8 +1252,8 @@ class FrescoV2VPipeline(StableDiffusionControlNetImg2ImgPipeline): The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters @@ -1456,7 +1456,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text 
encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -1588,7 +1588,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/examples/community/gluegen.py b/examples/community/gluegen.py index 1ad6911905db..91026c5d966f 100644 --- a/examples/community/gluegen.py +++ b/examples/community/gluegen.py @@ -7,7 +7,7 @@ from diffusers import DiffusionPipeline from diffusers.image_processor import VaeImageProcessor -from diffusers.loaders import LoraLoaderMixin +from diffusers.loaders import StableDiffusionLoraLoaderMixin from diffusers.models import AutoencoderKL, UNet2DConditionModel from diffusers.models.lora import adjust_lora_scale_text_encoder from diffusers.pipelines.pipeline_utils import StableDiffusionMixin @@ -194,7 +194,7 @@ def retrieve_timesteps( return timesteps, num_inference_steps -class GlueGenStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin, LoraLoaderMixin): +class GlueGenStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffusionLoraLoaderMixin): def __init__( self, vae: AutoencoderKL, @@ -290,7 +290,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -424,7 +424,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/examples/community/instaflow_one_step.py b/examples/community/instaflow_one_step.py index ab0393c8f76c..3fef02287186 100644 --- a/examples/community/instaflow_one_step.py +++ b/examples/community/instaflow_one_step.py @@ -21,7 +21,7 @@ from diffusers.configuration_utils import FrozenDict from diffusers.image_processor import VaeImageProcessor -from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from diffusers.loaders import FromSingleFileMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from diffusers.models import AutoencoderKL, UNet2DConditionModel from diffusers.models.lora import adjust_lora_scale_text_encoder from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin @@ -53,7 +53,11 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): class InstaFlowPipeline( - DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin + DiffusionPipeline, + StableDiffusionMixin, + 
TextualInversionLoaderMixin, + StableDiffusionLoraLoaderMixin, + FromSingleFileMixin, ): r""" Pipeline for text-to-image generation using Rectified Flow and Euler discretization. @@ -64,8 +68,8 @@ class InstaFlowPipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files Args: @@ -251,7 +255,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale diff --git a/examples/community/ip_adapter_face_id.py b/examples/community/ip_adapter_face_id.py index ae63113abef6..9d009c7745aa 100644 --- a/examples/community/ip_adapter_face_id.py +++ b/examples/community/ip_adapter_face_id.py @@ -24,7 +24,12 @@ from diffusers.configuration_utils import FrozenDict from diffusers.image_processor import VaeImageProcessor -from diffusers.loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from diffusers.loaders import ( + FromSingleFileMixin, + IPAdapterMixin, + StableDiffusionLoraLoaderMixin, + TextualInversionLoaderMixin, +) from diffusers.models import AutoencoderKL, UNet2DConditionModel from diffusers.models.attention_processor import ( AttnProcessor, @@ -130,7 +135,7 @@ class IPAdapterFaceIDStableDiffusionPipeline( DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, - LoraLoaderMixin, + StableDiffusionLoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin, ): @@ -142,8 +147,8 @@ class IPAdapterFaceIDStableDiffusionPipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters @@ -267,7 +272,6 @@ def __init__( def load_ip_adapter_face_id(self, pretrained_model_name_or_path_or_dict, weight_name, **kwargs): cache_dir = kwargs.pop("cache_dir", None) force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", False) proxies = kwargs.pop("proxies", None) local_files_only = kwargs.pop("local_files_only", None) token = kwargs.pop("token", None) @@ -283,7 +287,6 @@ def load_ip_adapter_face_id(self, pretrained_model_name_or_path_or_dict, weight_ weights_name=weight_name, cache_dir=cache_dir, force_download=force_download, - resume_download=resume_download, proxies=proxies, local_files_only=local_files_only, token=token, @@ -520,7 +523,7 @@ 
def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -652,7 +655,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/examples/community/kohya_hires_fix.py b/examples/community/kohya_hires_fix.py index 867d636c7cae..0e36f32b19a3 100644 --- a/examples/community/kohya_hires_fix.py +++ b/examples/community/kohya_hires_fix.py @@ -395,8 +395,8 @@ class StableDiffusionHighResFixPipeline(StableDiffusionPipeline): The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters diff --git a/examples/community/latent_consistency_interpolate.py b/examples/community/latent_consistency_interpolate.py index 8db70d3b9508..84adc125b191 100644 --- a/examples/community/latent_consistency_interpolate.py +++ b/examples/community/latent_consistency_interpolate.py @@ -6,7 +6,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from diffusers.image_processor import VaeImageProcessor -from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from diffusers.loaders import FromSingleFileMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from diffusers.models import AutoencoderKL, UNet2DConditionModel from diffusers.models.lora import adjust_lora_scale_text_encoder from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin @@ -190,7 +190,11 @@ def slerp( class LatentConsistencyModelWalkPipeline( - DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + StableDiffusionLoraLoaderMixin, + FromSingleFileMixin, ): r""" Pipeline for text-to-image generation using a latent consistency model. 
@@ -200,8 +204,8 @@ class LatentConsistencyModelWalkPipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files Args: @@ -317,7 +321,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -449,7 +453,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/examples/community/llm_grounded_diffusion.py b/examples/community/llm_grounded_diffusion.py index b25da201ddbe..49c074911354 100644 --- a/examples/community/llm_grounded_diffusion.py +++ b/examples/community/llm_grounded_diffusion.py @@ -29,7 +29,12 @@ from diffusers.configuration_utils import FrozenDict from diffusers.image_processor import PipelineImageInput, VaeImageProcessor -from diffusers.loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from diffusers.loaders import ( + FromSingleFileMixin, + IPAdapterMixin, + StableDiffusionLoraLoaderMixin, + TextualInversionLoaderMixin, +) from diffusers.models import AutoencoderKL, UNet2DConditionModel from diffusers.models.attention import Attention, GatedSelfAttentionDense from diffusers.models.attention_processor import AttnProcessor2_0 @@ -271,7 +276,7 @@ class LLMGroundedDiffusionPipeline( DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, - LoraLoaderMixin, + StableDiffusionLoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin, ): @@ -1263,7 +1268,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -1397,7 +1402,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/examples/community/lpw_stable_diffusion.py b/examples/community/lpw_stable_diffusion.py index 3665794db8f7..cb5466ff3b95 100644 --- 
a/examples/community/lpw_stable_diffusion.py +++ b/examples/community/lpw_stable_diffusion.py @@ -14,7 +14,7 @@ from diffusers import DiffusionPipeline from diffusers.configuration_utils import FrozenDict from diffusers.image_processor import VaeImageProcessor -from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from diffusers.loaders import FromSingleFileMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from diffusers.models import AutoencoderKL, UNet2DConditionModel from diffusers.pipelines.pipeline_utils import StableDiffusionMixin from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker @@ -412,7 +412,11 @@ def preprocess_mask(mask, batch_size, scale_factor=8): class StableDiffusionLongPromptWeightingPipeline( - DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + StableDiffusionLoraLoaderMixin, + FromSingleFileMixin, ): r""" Pipeline for text-to-image generation using Stable Diffusion without tokens length limit, and support parsing diff --git a/examples/community/lpw_stable_diffusion_xl.py b/examples/community/lpw_stable_diffusion_xl.py index b414fbe060b7..e78d21de71c2 100644 --- a/examples/community/lpw_stable_diffusion_xl.py +++ b/examples/community/lpw_stable_diffusion_xl.py @@ -22,7 +22,12 @@ from diffusers import DiffusionPipeline, StableDiffusionXLPipeline from diffusers.image_processor import PipelineImageInput, VaeImageProcessor -from diffusers.loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from diffusers.loaders import ( + FromSingleFileMixin, + IPAdapterMixin, + StableDiffusionLoraLoaderMixin, + TextualInversionLoaderMixin, +) from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel from diffusers.models.attention_processor import AttnProcessor2_0, XFormersAttnProcessor from diffusers.pipelines.pipeline_utils import StableDiffusionMixin @@ -544,7 +549,7 @@ class SDXLLongPromptWeightingPipeline( StableDiffusionMixin, FromSingleFileMixin, IPAdapterMixin, - LoraLoaderMixin, + StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin, ): r""" @@ -556,8 +561,8 @@ class SDXLLongPromptWeightingPipeline( The pipeline also inherits the following loading methods: - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings Args: @@ -738,7 +743,7 @@ def encode_prompt( # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale if prompt is not None and isinstance(prompt, str): diff --git a/examples/community/pipeline_animatediff_controlnet.py b/examples/community/pipeline_animatediff_controlnet.py index 4428efd0a1d3..99638e606399 100644 --- 
a/examples/community/pipeline_animatediff_controlnet.py +++ b/examples/community/pipeline_animatediff_controlnet.py @@ -22,7 +22,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection from diffusers.image_processor import PipelineImageInput, VaeImageProcessor -from diffusers.loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from diffusers.loaders import IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from diffusers.models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel, UNetMotionModel from diffusers.models.lora import adjust_lora_scale_text_encoder from diffusers.models.unets.unet_motion_model import MotionAdapter @@ -114,7 +114,11 @@ def tensor2vid(video: torch.Tensor, processor, output_type="np"): class AnimateDiffControlNetPipeline( - DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + IPAdapterMixin, + StableDiffusionLoraLoaderMixin, ): r""" Pipeline for text-to-video generation. @@ -124,8 +128,8 @@ class AnimateDiffControlNetPipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters Args: @@ -234,7 +238,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -366,7 +370,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/examples/community/pipeline_animatediff_img2video.py b/examples/community/pipeline_animatediff_img2video.py index 7546fbd9bcb0..0a578d4b8ef6 100644 --- a/examples/community/pipeline_animatediff_img2video.py +++ b/examples/community/pipeline_animatediff_img2video.py @@ -27,7 +27,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection from diffusers.image_processor import PipelineImageInput, VaeImageProcessor -from diffusers.loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from diffusers.loaders import IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel from diffusers.models.lora import adjust_lora_scale_text_encoder from diffusers.models.unet_motion_model import MotionAdapter @@ -240,7 +240,11 @@ def 
retrieve_timesteps( class AnimateDiffImgToVideoPipeline( - DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + IPAdapterMixin, + StableDiffusionLoraLoaderMixin, ): r""" Pipeline for image-to-video generation. @@ -250,8 +254,8 @@ class AnimateDiffImgToVideoPipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters Args: @@ -351,7 +355,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -483,7 +487,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/examples/community/pipeline_demofusion_sdxl.py b/examples/community/pipeline_demofusion_sdxl.py index e85ea1612ddd..f83d1b401420 100644 --- a/examples/community/pipeline_demofusion_sdxl.py +++ b/examples/community/pipeline_demofusion_sdxl.py @@ -12,7 +12,7 @@ from diffusers.image_processor import VaeImageProcessor from diffusers.loaders import ( FromSingleFileMixin, - LoraLoaderMixin, + StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin, ) from diffusers.models import AutoencoderKL, UNet2DConditionModel @@ -89,7 +89,11 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): class DemoFusionSDXLPipeline( - DiffusionPipeline, StableDiffusionMixin, FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin + DiffusionPipeline, + StableDiffusionMixin, + FromSingleFileMixin, + StableDiffusionLoraLoaderMixin, + TextualInversionLoaderMixin, ): r""" Pipeline for text-to-image generation using Stable Diffusion XL. 
@@ -231,7 +235,7 @@ def encode_prompt( # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale diff --git a/examples/community/pipeline_fabric.py b/examples/community/pipeline_fabric.py index f17c8e52f595..02fdcd04c103 100644 --- a/examples/community/pipeline_fabric.py +++ b/examples/community/pipeline_fabric.py @@ -21,7 +21,7 @@ from diffusers import AutoencoderKL, UNet2DConditionModel from diffusers.configuration_utils import FrozenDict from diffusers.image_processor import VaeImageProcessor -from diffusers.loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from diffusers.loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from diffusers.models.attention import BasicTransformerBlock from diffusers.models.attention_processor import LoRAAttnProcessor from diffusers.pipelines.pipeline_utils import DiffusionPipeline @@ -222,7 +222,7 @@ def _encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale if prompt is not None and isinstance(prompt, str): diff --git a/examples/community/pipeline_prompt2prompt.py b/examples/community/pipeline_prompt2prompt.py index 8e9bcddfef99..508e84177928 100644 --- a/examples/community/pipeline_prompt2prompt.py +++ b/examples/community/pipeline_prompt2prompt.py @@ -35,7 +35,7 @@ from diffusers.loaders import ( FromSingleFileMixin, IPAdapterMixin, - LoraLoaderMixin, + StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin, ) from diffusers.models.attention import Attention @@ -75,7 +75,7 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): class Prompt2PromptPipeline( DiffusionPipeline, TextualInversionLoaderMixin, - LoraLoaderMixin, + StableDiffusionLoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin, ): @@ -87,8 +87,8 @@ class Prompt2PromptPipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters @@ -286,7 +286,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -420,7 +420,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if 
isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/examples/community/pipeline_stable_diffusion_boxdiff.py b/examples/community/pipeline_stable_diffusion_boxdiff.py index 871c66743cdb..93891c6f9854 100644 --- a/examples/community/pipeline_stable_diffusion_boxdiff.py +++ b/examples/community/pipeline_stable_diffusion_boxdiff.py @@ -27,7 +27,12 @@ from diffusers.configuration_utils import FrozenDict from diffusers.image_processor import PipelineImageInput, VaeImageProcessor -from diffusers.loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from diffusers.loaders import ( + FromSingleFileMixin, + IPAdapterMixin, + StableDiffusionLoraLoaderMixin, + TextualInversionLoaderMixin, +) from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel from diffusers.models.attention_processor import Attention, FusedAttnProcessor2_0 from diffusers.models.lora import adjust_lora_scale_text_encoder @@ -358,7 +363,7 @@ def retrieve_timesteps( class StableDiffusionBoxDiffPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin + DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin ): r""" Pipeline for text-to-image generation using Stable Diffusion with BoxDiff. @@ -368,8 +373,8 @@ class StableDiffusionBoxDiffPipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters @@ -594,7 +599,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -726,7 +731,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/examples/community/pipeline_stable_diffusion_pag.py b/examples/community/pipeline_stable_diffusion_pag.py index c22027fbc4ec..6c2b2c32b759 100644 --- a/examples/community/pipeline_stable_diffusion_pag.py +++ b/examples/community/pipeline_stable_diffusion_pag.py @@ -11,7 +11,12 @@ from diffusers.configuration_utils import FrozenDict from diffusers.image_processor import PipelineImageInput, VaeImageProcessor -from diffusers.loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, 
TextualInversionLoaderMixin +from diffusers.loaders import ( + FromSingleFileMixin, + IPAdapterMixin, + StableDiffusionLoraLoaderMixin, + TextualInversionLoaderMixin, +) from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel from diffusers.models.attention_processor import Attention, AttnProcessor2_0, FusedAttnProcessor2_0 from diffusers.models.lora import adjust_lora_scale_text_encoder @@ -328,7 +333,7 @@ def retrieve_timesteps( class StableDiffusionPAGPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin + DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin ): r""" Pipeline for text-to-image generation using Stable Diffusion. @@ -336,8 +341,8 @@ class StableDiffusionPAGPipeline( implemented for all pipelines (downloading, saving, running on a particular device, etc.). The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters Args: @@ -560,7 +565,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -692,7 +697,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/examples/community/pipeline_stable_diffusion_upscale_ldm3d.py b/examples/community/pipeline_stable_diffusion_upscale_ldm3d.py index a873e7b2956e..1ac651a1fe60 100644 --- a/examples/community/pipeline_stable_diffusion_upscale_ldm3d.py +++ b/examples/community/pipeline_stable_diffusion_upscale_ldm3d.py @@ -22,7 +22,7 @@ from diffusers import DiffusionPipeline from diffusers.image_processor import PipelineDepthInput, PipelineImageInput, VaeImageProcessorLDM3D -from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from diffusers.loaders import FromSingleFileMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from diffusers.models import AutoencoderKL, UNet2DConditionModel from diffusers.models.lora import adjust_lora_scale_text_encoder from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker @@ -69,7 +69,7 @@ class StableDiffusionUpscaleLDM3DPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin + DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin, FromSingleFileMixin ): 
r""" Pipeline for text-to-image and 3D generation using LDM3D. @@ -79,8 +79,8 @@ class StableDiffusionUpscaleLDM3DPipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files Args: @@ -233,7 +233,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -365,7 +365,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py b/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py index 0f0cf5dba83f..07954f013295 100644 --- a/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py +++ b/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py @@ -33,7 +33,7 @@ from diffusers.image_processor import PipelineImageInput, VaeImageProcessor from diffusers.loaders import ( FromSingleFileMixin, - LoraLoaderMixin, + StableDiffusionLoraLoaderMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin, ) @@ -300,7 +300,7 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): class StableDiffusionXLControlNetAdapterInpaintPipeline( - DiffusionPipeline, StableDiffusionMixin, FromSingleFileMixin, LoraLoaderMixin + DiffusionPipeline, StableDiffusionMixin, FromSingleFileMixin, StableDiffusionLoraLoaderMixin ): r""" Pipeline for text-to-image generation using Stable Diffusion augmented with T2I-Adapter diff --git a/examples/community/pipeline_stable_diffusion_xl_differential_img2img.py b/examples/community/pipeline_stable_diffusion_xl_differential_img2img.py index 7853adbb0d8f..f576cc0c42d8 100644 --- a/examples/community/pipeline_stable_diffusion_xl_differential_img2img.py +++ b/examples/community/pipeline_stable_diffusion_xl_differential_img2img.py @@ -178,11 +178,11 @@ class StableDiffusionXLDifferentialImg2ImgPipeline( In addition the pipeline inherits the following loading methods: - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] - - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] + - *LoRA*: [`loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`] as well as the following saving methods: - - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] + - *LoRA*: [`loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] Args: vae 
([`AutoencoderKL`]): diff --git a/examples/community/sde_drag.py b/examples/community/sde_drag.py index 08e865b9c350..902eaa99f417 100644 --- a/examples/community/sde_drag.py +++ b/examples/community/sde_drag.py @@ -11,7 +11,7 @@ from transformers import CLIPTextModel, CLIPTokenizer from diffusers import AutoencoderKL, DiffusionPipeline, DPMSolverMultistepScheduler, UNet2DConditionModel -from diffusers.loaders import AttnProcsLayers, LoraLoaderMixin +from diffusers.loaders import AttnProcsLayers, StableDiffusionLoraLoaderMixin from diffusers.models.attention_processor import ( AttnAddedKVProcessor, AttnAddedKVProcessor2_0, @@ -321,7 +321,7 @@ def train_lora(self, prompt, image, lora_step=100, lora_rank=16, generator=None) optimizer.zero_grad() with tempfile.TemporaryDirectory() as save_lora_dir: - LoraLoaderMixin.save_lora_weights( + StableDiffusionLoraLoaderMixin.save_lora_weights( save_directory=save_lora_dir, unet_lora_layers=unet_lora_layers, text_encoder_lora_layers=None, diff --git a/examples/community/stable_diffusion_ipex.py b/examples/community/stable_diffusion_ipex.py index 92588ba8a2e8..388992a740ec 100644 --- a/examples/community/stable_diffusion_ipex.py +++ b/examples/community/stable_diffusion_ipex.py @@ -21,7 +21,7 @@ from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer from diffusers.configuration_utils import FrozenDict -from diffusers.loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from diffusers.loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from diffusers.models import AutoencoderKL, UNet2DConditionModel from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput @@ -61,7 +61,7 @@ class StableDiffusionIPEXPipeline( - DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin + DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin ): r""" Pipeline for text-to-image generation using Stable Diffusion on IPEX. diff --git a/examples/community/stable_diffusion_reference.py b/examples/community/stable_diffusion_reference.py index 15c3f8845f3b..efb0fa89dbfc 100644 --- a/examples/community/stable_diffusion_reference.py +++ b/examples/community/stable_diffusion_reference.py @@ -11,7 +11,12 @@ from diffusers import AutoencoderKL, DiffusionPipeline, UNet2DConditionModel from diffusers.configuration_utils import FrozenDict, deprecate from diffusers.image_processor import VaeImageProcessor -from diffusers.loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from diffusers.loaders import ( + FromSingleFileMixin, + IPAdapterMixin, + StableDiffusionLoraLoaderMixin, + TextualInversionLoaderMixin, +) from diffusers.models.attention import BasicTransformerBlock from diffusers.models.lora import adjust_lora_scale_text_encoder from diffusers.models.unets.unet_2d_blocks import CrossAttnDownBlock2D, CrossAttnUpBlock2D, DownBlock2D, UpBlock2D @@ -76,7 +81,7 @@ def torch_dfs(model: torch.nn.Module): class StableDiffusionReferencePipeline( - DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin + DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin ): r""" Pipeline for Stable Diffusion Reference. 
@@ -86,8 +91,8 @@ class StableDiffusionReferencePipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters @@ -443,7 +448,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -575,7 +580,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/examples/community/stable_diffusion_repaint.py b/examples/community/stable_diffusion_repaint.py index 2addc5a62dbb..980e9a155997 100644 --- a/examples/community/stable_diffusion_repaint.py +++ b/examples/community/stable_diffusion_repaint.py @@ -23,7 +23,7 @@ from diffusers import AutoencoderKL, DiffusionPipeline, UNet2DConditionModel from diffusers.configuration_utils import FrozenDict, deprecate -from diffusers.loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from diffusers.loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from diffusers.pipelines.pipeline_utils import StableDiffusionMixin from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput from diffusers.pipelines.stable_diffusion.safety_checker import ( @@ -140,7 +140,7 @@ def prepare_mask_and_masked_image(image, mask): class StableDiffusionRepaintPipeline( - DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin + DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin ): r""" Pipeline for text-guided image inpainting using Stable Diffusion. *This is an experimental feature*. @@ -148,9 +148,9 @@ class StableDiffusionRepaintPipeline( library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) In addition the pipeline inherits the following loading methods: - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] - - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] + - *LoRA*: [`loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] as well as the following saving methods: - - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] + - *LoRA*: [`loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] Args: vae ([`AutoencoderKL`]): Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. 
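The hunks above (and several more below) swap `LoraLoaderMixin` for `StableDiffusionLoraLoaderMixin` without touching any call signatures. A minimal usage sketch of the renamed entry points, assuming a stock SD 1.x checkpoint and the example LoRA repo already cited in this diff's README changes; the repo ids are illustrative and not part of the change itself:

```python
import torch

from diffusers import StableDiffusionPipeline
from diffusers.loaders import StableDiffusionLoraLoaderMixin

# Placeholder checkpoint and LoRA repo ids; any matching SD 1.x pair will do.
pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
)

# Instance-level loading is untouched by the rename: pipelines now inherit
# StableDiffusionLoraLoaderMixin, and `load_lora_weights` keeps its signature.
pipe.load_lora_weights("patrickvonplaten/lora_dreambooth_dog_example")

# The class-level helpers used by the training scripts in this diff move with the
# rename as well (`lora_state_dict`, `save_lora_weights`, `load_lora_into_unet`).
lora_state_dict, network_alphas = StableDiffusionLoraLoaderMixin.lora_state_dict(
    "patrickvonplaten/lora_dreambooth_dog_example"
)
print(f"loaded {len(lora_state_dict)} LoRA tensors")
```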
diff --git a/examples/community/stable_diffusion_tensorrt_img2img.py b/examples/community/stable_diffusion_tensorrt_img2img.py index 7264a60506fe..40ad38bfe903 100755 --- a/examples/community/stable_diffusion_tensorrt_img2img.py +++ b/examples/community/stable_diffusion_tensorrt_img2img.py @@ -18,8 +18,7 @@ import gc import os from collections import OrderedDict -from copy import copy -from typing import List, Optional, Union +from typing import List, Optional, Tuple, Union import numpy as np import onnx @@ -27,9 +26,11 @@ import PIL.Image import tensorrt as trt import torch +from cuda import cudart from huggingface_hub import snapshot_download from huggingface_hub.utils import validate_hf_hub_args from onnx import shape_inference +from packaging import version from polygraphy import cuda from polygraphy.backend.common import bytes_from_path from polygraphy.backend.onnx.loader import fold_constants @@ -41,12 +42,13 @@ network_from_onnx_path, save_engine, ) -from polygraphy.backend.trt import util as trt_util from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection +from diffusers import DiffusionPipeline +from diffusers.configuration_utils import FrozenDict, deprecate +from diffusers.image_processor import VaeImageProcessor from diffusers.models import AutoencoderKL, UNet2DConditionModel from diffusers.pipelines.stable_diffusion import ( - StableDiffusionImg2ImgPipeline, StableDiffusionPipelineOutput, StableDiffusionSafetyChecker, ) @@ -58,7 +60,7 @@ """ Installation instructions python3 -m pip install --upgrade transformers diffusers>=0.16.0 -python3 -m pip install --upgrade tensorrt>=8.6.1 +python3 -m pip install --upgrade tensorrt-cu12==10.2.0 python3 -m pip install --upgrade polygraphy>=0.47.0 onnx-graphsurgeon --extra-index-url https://pypi.ngc.nvidia.com python3 -m pip install onnxruntime """ @@ -88,10 +90,6 @@ torch_to_numpy_dtype_dict = {value: key for (key, value) in numpy_to_torch_dtype_dict.items()} -def device_view(t): - return cuda.DeviceView(ptr=t.data_ptr(), shape=t.shape, dtype=torch_to_numpy_dtype_dict[t.dtype]) - - def preprocess_image(image): """ image: torch.Tensor @@ -125,10 +123,8 @@ def build( onnx_path, fp16, input_profile=None, - enable_preview=False, enable_all_tactics=False, timing_cache=None, - workspace_size=0, ): logger.warning(f"Building TensorRT engine for {onnx_path}: {self.engine_path}") p = Profile() @@ -137,20 +133,13 @@ def build( assert len(dims) == 3 p.add(name, min=dims[0], opt=dims[1], max=dims[2]) - config_kwargs = {} - - config_kwargs["preview_features"] = [trt.PreviewFeature.DISABLE_EXTERNAL_TACTIC_SOURCES_FOR_CORE_0805] - if enable_preview: - # Faster dynamic shapes made optional since it increases engine build time. 
- config_kwargs["preview_features"].append(trt.PreviewFeature.FASTER_DYNAMIC_SHAPES_0805) - if workspace_size > 0: - config_kwargs["memory_pool_limits"] = {trt.MemoryPoolType.WORKSPACE: workspace_size} + extra_build_args = {} if not enable_all_tactics: - config_kwargs["tactic_sources"] = [] + extra_build_args["tactic_sources"] = [] engine = engine_from_network( network_from_onnx_path(onnx_path, flags=[trt.OnnxParserFlag.NATIVE_INSTANCENORM]), - config=CreateConfig(fp16=fp16, profiles=[p], load_timing_cache=timing_cache, **config_kwargs), + config=CreateConfig(fp16=fp16, profiles=[p], load_timing_cache=timing_cache, **extra_build_args), save_timing_cache=timing_cache, ) save_engine(engine, path=self.engine_path) @@ -163,28 +152,24 @@ def activate(self): self.context = self.engine.create_execution_context() def allocate_buffers(self, shape_dict=None, device="cuda"): - for idx in range(trt_util.get_bindings_per_profile(self.engine)): - binding = self.engine[idx] - if shape_dict and binding in shape_dict: - shape = shape_dict[binding] + for binding in range(self.engine.num_io_tensors): + name = self.engine.get_tensor_name(binding) + if shape_dict and name in shape_dict: + shape = shape_dict[name] else: - shape = self.engine.get_binding_shape(binding) - dtype = trt.nptype(self.engine.get_binding_dtype(binding)) - if self.engine.binding_is_input(binding): - self.context.set_binding_shape(idx, shape) + shape = self.engine.get_tensor_shape(name) + dtype = trt.nptype(self.engine.get_tensor_dtype(name)) + if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT: + self.context.set_input_shape(name, shape) tensor = torch.empty(tuple(shape), dtype=numpy_to_torch_dtype_dict[dtype]).to(device=device) - self.tensors[binding] = tensor - self.buffers[binding] = cuda.DeviceView(ptr=tensor.data_ptr(), shape=shape, dtype=dtype) + self.tensors[name] = tensor def infer(self, feed_dict, stream): - start_binding, end_binding = trt_util.get_active_profile_bindings(self.context) - # shallow copy of ordered dict - device_buffers = copy(self.buffers) for name, buf in feed_dict.items(): - assert isinstance(buf, cuda.DeviceView) - device_buffers[name] = buf - bindings = [0] * start_binding + [buf.ptr for buf in device_buffers.values()] - noerror = self.context.execute_async_v2(bindings=bindings, stream_handle=stream.ptr) + self.tensors[name].copy_(buf) + for name, tensor in self.tensors.items(): + self.context.set_tensor_address(name, tensor.data_ptr()) + noerror = self.context.execute_async_v3(stream) if not noerror: raise ValueError("ERROR: inference failed.") @@ -325,10 +310,8 @@ def build_engines( force_engine_rebuild=False, static_batch=False, static_shape=True, - enable_preview=False, enable_all_tactics=False, timing_cache=None, - max_workspace_size=0, ): built_engines = {} if not os.path.isdir(onnx_dir): @@ -393,9 +376,7 @@ def build_engines( static_batch=static_batch, static_shape=static_shape, ), - enable_preview=enable_preview, timing_cache=timing_cache, - workspace_size=max_workspace_size, ) built_engines[model_name] = engine @@ -674,7 +655,7 @@ def make_VAEEncoder(model, device, max_batch_size, embedding_dim, inpaint=False) return VAEEncoder(model, device=device, max_batch_size=max_batch_size, embedding_dim=embedding_dim) -class TensorRTStableDiffusionImg2ImgPipeline(StableDiffusionImg2ImgPipeline): +class TensorRTStableDiffusionImg2ImgPipeline(DiffusionPipeline): r""" Pipeline for image-to-image generation using TensorRT accelerated Stable Diffusion. 
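The `allocate_buffers`/`infer` rewrite above drops the removed binding-index API (and the polygraphy `DeviceView` wrappers) in favor of TensorRT's named I/O-tensor API: `get_tensor_name`, `set_input_shape`, `set_tensor_address`, and `execute_async_v3`. A standalone sketch of that pattern, assuming an already-built `trt.ICudaEngine` and caller-provided CUDA tensors; the helper name `run_engine` and the dtype table are illustrative, not from the diff:

```python
import tensorrt as trt
import torch
from cuda import cudart

# trt.nptype() returns numpy dtypes; map the two dtypes used here onto torch equivalents.
_trt_to_torch_dtype = {
    trt.nptype(trt.DataType.FLOAT): torch.float32,
    trt.nptype(trt.DataType.HALF): torch.float16,
}


def run_engine(engine: "trt.ICudaEngine", feed_dict: dict) -> dict:
    """Run one inference pass; `feed_dict` maps input tensor names to CUDA torch tensors."""
    context = engine.create_execution_context()
    stream = cudart.cudaStreamCreate()[1]
    tensors = {}

    # First pass: register input shapes so any dynamic output shapes can be resolved.
    for i in range(engine.num_io_tensors):
        name = engine.get_tensor_name(i)
        if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
            context.set_input_shape(name, tuple(feed_dict[name].shape))
            tensors[name] = feed_dict[name].contiguous()

    # Second pass: allocate outputs with the shapes the context now reports.
    for i in range(engine.num_io_tensors):
        name = engine.get_tensor_name(i)
        if engine.get_tensor_mode(name) != trt.TensorIOMode.INPUT:
            shape = tuple(context.get_tensor_shape(name))
            dtype = _trt_to_torch_dtype[trt.nptype(engine.get_tensor_dtype(name))]
            tensors[name] = torch.empty(shape, dtype=dtype, device="cuda")

    # execute_async_v3 consumes per-name tensor addresses instead of a positional bindings list.
    for name, tensor in tensors.items():
        context.set_tensor_address(name, tensor.data_ptr())
    if not context.execute_async_v3(stream):
        raise RuntimeError("TensorRT inference failed")
    return tensors
```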
@@ -702,6 +683,8 @@ class TensorRTStableDiffusionImg2ImgPipeline(StableDiffusionImg2ImgPipeline): Model that extracts features from generated images to be used as inputs for the `safety_checker`. """ + _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] + def __init__( self, vae: AutoencoderKL, @@ -722,24 +705,86 @@ def __init__( onnx_dir: str = "onnx", # TensorRT engine build parameters engine_dir: str = "engine", - build_preview_features: bool = True, force_engine_rebuild: bool = False, timing_cache: str = "timing_cache", ): - super().__init__( - vae, - text_encoder, - tokenizer, - unet, - scheduler, + super().__init__() + + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" + f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " + "to update the config accordingly as leaving `steps_offset` might led to incorrect results" + " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," + " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["steps_offset"] = 1 + scheduler._internal_dict = FrozenDict(new_config) + + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." + " `clip_sample` should be set to False in the configuration file. Please make sure to update the" + " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in" + " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" + " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" + ) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["clip_sample"] = False + scheduler._internal_dict = FrozenDict(new_config) + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." 
+ ) + + is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse( + version.parse(unet.config._diffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 + if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: + deprecation_message = ( + "The configuration file of the unet has set the default `sample_size` to smaller than" + " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the" + " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-" + " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5" + " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the" + " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" + " in the config might lead to incorrect results in future versions. If you have downloaded this" + " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(unet.config) + new_config["sample_size"] = 64 + unet._internal_dict = FrozenDict(new_config) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor, image_encoder=image_encoder, - requires_safety_checker=requires_safety_checker, ) - self.vae.forward = self.vae.decode - self.stages = stages self.image_height, self.image_width = image_height, image_width self.inpaint = False @@ -750,7 +795,6 @@ def __init__( self.timing_cache = timing_cache self.build_static_batch = False self.build_dynamic_shape = False - self.build_preview_features = build_preview_features self.max_batch_size = max_batch_size # TODO: Restrict batch size to 4 for larger image dimensions as a WAR for TensorRT limitation. @@ -761,6 +805,11 @@ def __init__( self.models = {} # loaded in __loadModels() self.engine = {} # loaded in build_engines() + self.vae.forward = self.vae.decode + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.register_to_config(requires_safety_checker=requires_safety_checker) + def __loadModels(self): # Load pipeline models self.embedding_dim = self.text_encoder.config.hidden_size @@ -779,11 +828,37 @@ def __loadModels(self): if "vae_encoder" in self.stages: self.models["vae_encoder"] = make_VAEEncoder(self.vae, **models_args) + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker + def run_safety_checker( + self, image: Union[torch.Tensor, PIL.Image.Image], device: torch.device, dtype: torch.dtype + ) -> Tuple[Union[torch.Tensor, PIL.Image.Image], Optional[bool]]: + r""" + Runs the safety checker on the given image. + Args: + image (Union[torch.Tensor, PIL.Image.Image]): The input image to be checked. + device (torch.device): The device to run the safety checker on. + dtype (torch.dtype): The data type of the input image. 
+ Returns: + (image, has_nsfw_concept) Tuple[Union[torch.Tensor, PIL.Image.Image], Optional[bool]]: A tuple containing the processed image and + a boolean indicating whether the image has a NSFW (Not Safe for Work) concept. + """ + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + @classmethod @validate_hf_hub_args def set_cached_folder(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs): cache_dir = kwargs.pop("cache_dir", None) - resume_download = kwargs.pop("resume_download", False) proxies = kwargs.pop("proxies", None) local_files_only = kwargs.pop("local_files_only", False) token = kwargs.pop("token", None) @@ -795,7 +870,6 @@ def set_cached_folder(cls, pretrained_model_name_or_path: Optional[Union[str, os else snapshot_download( pretrained_model_name_or_path, cache_dir=cache_dir, - resume_download=resume_download, proxies=proxies, local_files_only=local_files_only, token=token, @@ -828,7 +902,6 @@ def to(self, torch_device: Optional[Union[str, torch.device]] = None, silence_dt force_engine_rebuild=self.force_engine_rebuild, static_batch=self.build_static_batch, static_shape=not self.build_dynamic_shape, - enable_preview=self.build_preview_features, timing_cache=self.timing_cache, ) @@ -852,9 +925,7 @@ def __preprocess_images(self, batch_size, images=()): return tuple(init_images) def __encode_image(self, init_image): - init_latents = runEngine(self.engine["vae_encoder"], {"images": device_view(init_image)}, self.stream)[ - "latent" - ] + init_latents = runEngine(self.engine["vae_encoder"], {"images": init_image}, self.stream)["latent"] init_latents = 0.18215 * init_latents return init_latents @@ -883,9 +954,8 @@ def __encode_prompt(self, prompt, negative_prompt): .to(self.torch_device) ) - text_input_ids_inp = device_view(text_input_ids) # NOTE: output tensor for CLIP must be cloned because it will be overwritten when called again for negative prompt - text_embeddings = runEngine(self.engine["clip"], {"input_ids": text_input_ids_inp}, self.stream)[ + text_embeddings = runEngine(self.engine["clip"], {"input_ids": text_input_ids}, self.stream)[ "text_embeddings" ].clone() @@ -901,8 +971,7 @@ def __encode_prompt(self, prompt, negative_prompt): .input_ids.type(torch.int32) .to(self.torch_device) ) - uncond_input_ids_inp = device_view(uncond_input_ids) - uncond_embeddings = runEngine(self.engine["clip"], {"input_ids": uncond_input_ids_inp}, self.stream)[ + uncond_embeddings = runEngine(self.engine["clip"], {"input_ids": uncond_input_ids}, self.stream)[ "text_embeddings" ] @@ -926,18 +995,15 @@ def __denoise_latent( # Predict the noise residual timestep_float = timestep.float() if timestep.dtype != torch.float32 else timestep - sample_inp = device_view(latent_model_input) - timestep_inp = device_view(timestep_float) - embeddings_inp = device_view(text_embeddings) noise_pred = runEngine( self.engine["unet"], - {"sample": sample_inp, "timestep": timestep_inp, "encoder_hidden_states": embeddings_inp}, + {"sample": latent_model_input, "timestep": timestep_float, "encoder_hidden_states": 
text_embeddings}, self.stream, )["latent"] # Perform guidance noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + self._guidance_scale * (noise_pred_text - noise_pred_uncond) latents = self.scheduler.step(noise_pred, timestep, latents).prev_sample @@ -945,12 +1011,12 @@ def __denoise_latent( return latents def __decode_latent(self, latents): - images = runEngine(self.engine["vae"], {"latent": device_view(latents)}, self.stream)["images"] + images = runEngine(self.engine["vae"], {"latent": latents}, self.stream)["images"] images = (images / 2 + 0.5).clamp(0, 1) return images.cpu().permute(0, 2, 3, 1).float().numpy() def __loadResources(self, image_height, image_width, batch_size): - self.stream = cuda.Stream() + self.stream = cudart.cudaStreamCreate()[1] # Allocate buffers for TensorRT engine bindings for model_name, obj in self.models.items(): @@ -1063,5 +1129,6 @@ def __call__( # VAE decode latent images = self.__decode_latent(latents) + images, has_nsfw_concept = self.run_safety_checker(images, self.torch_device, text_embeddings.dtype) images = self.numpy_to_pil(images) - return StableDiffusionPipelineOutput(images=images, nsfw_content_detected=None) + return StableDiffusionPipelineOutput(images=images, nsfw_content_detected=has_nsfw_concept) diff --git a/examples/community/stable_diffusion_tensorrt_inpaint.py b/examples/community/stable_diffusion_tensorrt_inpaint.py index b2d61a3dab93..8bacd050571a 100755 --- a/examples/community/stable_diffusion_tensorrt_inpaint.py +++ b/examples/community/stable_diffusion_tensorrt_inpaint.py @@ -783,7 +783,6 @@ def __loadModels(self): @validate_hf_hub_args def set_cached_folder(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs): cache_dir = kwargs.pop("cache_dir", None) - resume_download = kwargs.pop("resume_download", False) proxies = kwargs.pop("proxies", None) local_files_only = kwargs.pop("local_files_only", False) token = kwargs.pop("token", None) @@ -795,7 +794,6 @@ def set_cached_folder(cls, pretrained_model_name_or_path: Optional[Union[str, os else snapshot_download( pretrained_model_name_or_path, cache_dir=cache_dir, - resume_download=resume_download, proxies=proxies, local_files_only=local_files_only, token=token, diff --git a/examples/community/stable_diffusion_tensorrt_txt2img.py b/examples/community/stable_diffusion_tensorrt_txt2img.py index 1fcfafadb4f7..6072a357bc5d 100755 --- a/examples/community/stable_diffusion_tensorrt_txt2img.py +++ b/examples/community/stable_diffusion_tensorrt_txt2img.py @@ -695,7 +695,6 @@ def __loadModels(self): @validate_hf_hub_args def set_cached_folder(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs): cache_dir = kwargs.pop("cache_dir", None) - resume_download = kwargs.pop("resume_download", False) proxies = kwargs.pop("proxies", None) local_files_only = kwargs.pop("local_files_only", False) token = kwargs.pop("token", None) @@ -707,7 +706,6 @@ def set_cached_folder(cls, pretrained_model_name_or_path: Optional[Union[str, os else snapshot_download( pretrained_model_name_or_path, cache_dir=cache_dir, - resume_download=resume_download, proxies=proxies, local_files_only=local_files_only, token=token, diff --git a/examples/dreambooth/README.md b/examples/dreambooth/README.md index 351861159cf7..a331d42e7fa3 100644 --- a/examples/dreambooth/README.md +++ b/examples/dreambooth/README.md @@ -425,8 +425,8 @@ 
pipe.load_lora_weights(lora_model_id) image = pipe("A picture of a sks dog in a bucket", num_inference_steps=25).images[0] ``` -Note that the use of [`LoraLoaderMixin.load_lora_weights`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraLoaderMixin.load_lora_weights) is preferred to [`UNet2DConditionLoadersMixin.load_attn_procs`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.UNet2DConditionLoadersMixin.load_attn_procs) for loading LoRA parameters. This is because -`LoraLoaderMixin.load_lora_weights` can handle the following situations: +Note that the use of [`StableDiffusionLoraLoaderMixin.load_lora_weights`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.StableDiffusionLoraLoaderMixin.load_lora_weights) is preferred to [`UNet2DConditionLoadersMixin.load_attn_procs`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.UNet2DConditionLoadersMixin.load_attn_procs) for loading LoRA parameters. This is because +`StableDiffusionLoraLoaderMixin.load_lora_weights` can handle the following situations: * LoRA parameters that don't have separate identifiers for the UNet and the text encoder (such as [`"patrickvonplaten/lora_dreambooth_dog_example"`](https://huggingface.co/patrickvonplaten/lora_dreambooth_dog_example)). So, you can just do: diff --git a/examples/dreambooth/README_sd3.md b/examples/dreambooth/README_sd3.md index e3d2247d974e..052e383ef6f0 100644 --- a/examples/dreambooth/README_sd3.md +++ b/examples/dreambooth/README_sd3.md @@ -183,4 +183,6 @@ accelerate launch train_dreambooth_lora_sd3.py \ ## Other notes -We default to the "logit_normal" weighting scheme for the loss following the SD3 paper. Thanks to @bghira for helping us discover that for other weighting schemes supported from the training script, training may incur numerical instabilities. \ No newline at end of file +1. We default to the "logit_normal" weighting scheme for the loss following the SD3 paper. Thanks to @bghira for helping us discover that for other weighting schemes supported from the training script, training may incur numerical instabilities. +2. Thanks to `bghira`, `JinxuXiang`, and `bendanzzc` for helping us discover a bug in how VAE encoding was being done previously. This has been fixed in [#8917](https://github.com/huggingface/diffusers/pull/8917). +3. Additionally, we now have the option to control if we want to apply preconditioning to the model outputs via a `--precondition_outputs` CLI arg. It affects how the model `target` is calculated as well. 
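Note 3 above only changes how the regression target is formed. A toy `torch` sketch of the two branches, mirroring the `--precondition_outputs` hunks in the SD3 training scripts further below; the shapes and random values here are placeholders and the loss weighting is omitted:

```python
import torch

batch, channels, size = 2, 4, 8
model_input = torch.randn(batch, channels, size, size)  # VAE latents (after shift/scale)
noise = torch.randn_like(model_input)
sigmas = torch.rand(batch, 1, 1, 1)                     # per-sample sigma in [0, 1)

# Flow-matching interpolation: zt = (1 - t) * x + t * z1
noisy_model_input = (1.0 - sigmas) * model_input + sigmas * noise

model_pred = torch.randn_like(model_input)              # stand-in for the transformer output

precondition_outputs = True
if precondition_outputs:
    # EDM-style preconditioning: the prediction is folded back onto the noisy input
    # and regressed against the clean latents.
    model_pred = model_pred * (-sigmas) + noisy_model_input
    target = model_input
else:
    # Without preconditioning, the raw prediction regresses onto the flow-matching velocity.
    target = noise - model_input

loss = torch.mean((model_pred.float() - target.float()) ** 2)
```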
\ No newline at end of file diff --git a/examples/dreambooth/train_dreambooth_lora.py b/examples/dreambooth/train_dreambooth_lora.py index c8af49ac0373..ac3e4ad69616 100644 --- a/examples/dreambooth/train_dreambooth_lora.py +++ b/examples/dreambooth/train_dreambooth_lora.py @@ -52,7 +52,7 @@ StableDiffusionPipeline, UNet2DConditionModel, ) -from diffusers.loaders import LoraLoaderMixin +from diffusers.loaders import StableDiffusionLoraLoaderMixin from diffusers.optimization import get_scheduler from diffusers.training_utils import _set_state_dict_into_text_encoder, cast_training_params from diffusers.utils import ( @@ -956,7 +956,7 @@ def save_model_hook(models, weights, output_dir): # make sure to pop weight so that corresponding model is not saved again weights.pop() - LoraLoaderMixin.save_lora_weights( + StableDiffusionLoraLoaderMixin.save_lora_weights( output_dir, unet_lora_layers=unet_lora_layers_to_save, text_encoder_lora_layers=text_encoder_lora_layers_to_save, @@ -976,7 +976,7 @@ def load_model_hook(models, input_dir): else: raise ValueError(f"unexpected save model: {model.__class__}") - lora_state_dict, network_alphas = LoraLoaderMixin.lora_state_dict(input_dir) + lora_state_dict, network_alphas = StableDiffusionLoraLoaderMixin.lora_state_dict(input_dir) unet_state_dict = {f'{k.replace("unet.", "")}': v for k, v in lora_state_dict.items() if k.startswith("unet.")} unet_state_dict = convert_unet_state_dict_to_peft(unet_state_dict) @@ -1376,7 +1376,7 @@ def compute_text_embeddings(prompt): else: text_encoder_state_dict = None - LoraLoaderMixin.save_lora_weights( + StableDiffusionLoraLoaderMixin.save_lora_weights( save_directory=args.output_dir, unet_lora_layers=unet_lora_state_dict, text_encoder_lora_layers=text_encoder_state_dict, diff --git a/examples/dreambooth/train_dreambooth_lora_sd3.py b/examples/dreambooth/train_dreambooth_lora_sd3.py index 5401ee570a34..1d7078628e4c 100644 --- a/examples/dreambooth/train_dreambooth_lora_sd3.py +++ b/examples/dreambooth/train_dreambooth_lora_sd3.py @@ -523,6 +523,13 @@ def parse_args(input_args=None): default=1.29, help="Scale of mode weighting scheme. Only effective when using the `'mode'` as the `weighting_scheme`.", ) + parser.add_argument( + "--precondition_outputs", + type=int, + default=1, + help="Flag indicating if we are preconditioning the model outputs or not as done in EDM. This affects how " + "model `target` is calculated.", + ) parser.add_argument( "--optimizer", type=str, @@ -1636,7 +1643,7 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32): # Convert images to latent space model_input = vae.encode(pixel_values).latent_dist.sample() - model_input = model_input * vae.config.scaling_factor + model_input = (model_input - vae.config.shift_factor) * vae.config.scaling_factor model_input = model_input.to(dtype=weight_dtype) # Sample noise that we'll add to the latents @@ -1656,8 +1663,9 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32): timesteps = noise_scheduler_copy.timesteps[indices].to(device=model_input.device) # Add noise according to flow matching. + # zt = (1 - texp) * x + texp * z1 sigmas = get_sigmas(timesteps, n_dim=model_input.ndim, dtype=model_input.dtype) - noisy_model_input = sigmas * noise + (1.0 - sigmas) * model_input + noisy_model_input = (1.0 - sigmas) * model_input + sigmas * noise # Predict the noise residual model_pred = transformer( @@ -1670,14 +1678,18 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32): # Follow: Section 5 of https://arxiv.org/abs/2206.00364. 
# Preconditioning of the model outputs. - model_pred = model_pred * (-sigmas) + noisy_model_input + if args.precondition_outputs: + model_pred = model_pred * (-sigmas) + noisy_model_input # these weighting schemes use a uniform timestep sampling # and instead post-weight the loss weighting = compute_loss_weighting_for_sd3(weighting_scheme=args.weighting_scheme, sigmas=sigmas) # flow matching loss - target = model_input + if args.precondition_outputs: + target = model_input + else: + target = noise - model_input if args.with_prior_preservation: # Chunk the noise and model_pred into two parts and compute the loss on each part separately. diff --git a/examples/dreambooth/train_dreambooth_lora_sdxl.py b/examples/dreambooth/train_dreambooth_lora_sdxl.py index f5b6e5f65d56..68f55e1faf4b 100644 --- a/examples/dreambooth/train_dreambooth_lora_sdxl.py +++ b/examples/dreambooth/train_dreambooth_lora_sdxl.py @@ -58,7 +58,7 @@ StableDiffusionXLPipeline, UNet2DConditionModel, ) -from diffusers.loaders import LoraLoaderMixin +from diffusers.loaders import StableDiffusionLoraLoaderMixin from diffusers.optimization import get_scheduler from diffusers.training_utils import _set_state_dict_into_text_encoder, cast_training_params, compute_snr from diffusers.utils import ( @@ -1260,7 +1260,7 @@ def load_model_hook(models, input_dir): else: raise ValueError(f"unexpected save model: {model.__class__}") - lora_state_dict, network_alphas = LoraLoaderMixin.lora_state_dict(input_dir) + lora_state_dict, network_alphas = StableDiffusionLoraLoaderMixin.lora_state_dict(input_dir) unet_state_dict = {f'{k.replace("unet.", "")}': v for k, v in lora_state_dict.items() if k.startswith("unet.")} unet_state_dict = convert_unet_state_dict_to_peft(unet_state_dict) diff --git a/examples/dreambooth/train_dreambooth_sd3.py b/examples/dreambooth/train_dreambooth_sd3.py index 9a72294c20bd..ebd30468b313 100644 --- a/examples/dreambooth/train_dreambooth_sd3.py +++ b/examples/dreambooth/train_dreambooth_sd3.py @@ -494,6 +494,13 @@ def parse_args(input_args=None): default=1.29, help="Scale of mode weighting scheme. Only effective when using the `'mode'` as the `weighting_scheme`.", ) + parser.add_argument( + "--precondition_outputs", + type=int, + default=1, + help="Flag indicating if we are preconditioning the model outputs or not as done in EDM. This affects how " + "model `target` is calculated.", + ) parser.add_argument( "--optimizer", type=str, @@ -1549,7 +1556,7 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32): # Convert images to latent space model_input = vae.encode(pixel_values).latent_dist.sample() - model_input = model_input * vae.config.scaling_factor + model_input = (model_input - vae.config.shift_factor) * vae.config.scaling_factor model_input = model_input.to(dtype=weight_dtype) # Sample noise that we'll add to the latents @@ -1569,8 +1576,9 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32): timesteps = noise_scheduler_copy.timesteps[indices].to(device=model_input.device) # Add noise according to flow matching. + # zt = (1 - texp) * x + texp * z1 sigmas = get_sigmas(timesteps, n_dim=model_input.ndim, dtype=model_input.dtype) - noisy_model_input = sigmas * noise + (1.0 - sigmas) * model_input + noisy_model_input = (1.0 - sigmas) * model_input + sigmas * noise # Predict the noise residual if not args.train_text_encoder: @@ -1598,13 +1606,18 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32): # Follow: Section 5 of https://arxiv.org/abs/2206.00364. # Preconditioning of the model outputs. 
- model_pred = model_pred * (-sigmas) + noisy_model_input + if args.precondition_outputs: + model_pred = model_pred * (-sigmas) + noisy_model_input + # these weighting schemes use a uniform timestep sampling # and instead post-weight the loss weighting = compute_loss_weighting_for_sd3(weighting_scheme=args.weighting_scheme, sigmas=sigmas) # flow matching loss - target = model_input + if args.precondition_outputs: + target = model_input + else: + target = noise - model_input if args.with_prior_preservation: # Chunk the noise and model_pred into two parts and compute the loss on each part separately. diff --git a/examples/research_projects/consistency_training/train_cm_ct_unconditional.py b/examples/research_projects/consistency_training/train_cm_ct_unconditional.py index b7a1e2a545f8..eccc539f230c 100644 --- a/examples/research_projects/consistency_training/train_cm_ct_unconditional.py +++ b/examples/research_projects/consistency_training/train_cm_ct_unconditional.py @@ -1195,7 +1195,7 @@ def unwrap_model(model): # Resolve the c parameter for the Pseudo-Huber loss if args.huber_c is None: - args.huber_c = 0.00054 * args.resolution * math.sqrt(unet.config.in_channels) + args.huber_c = 0.00054 * args.resolution * math.sqrt(unwrap_model(unet).config.in_channels) # Get current number of discretization steps N according to our discretization curriculum current_discretization_steps = get_discretization_steps( diff --git a/examples/research_projects/diffusion_dpo/train_diffusion_dpo.py b/examples/research_projects/diffusion_dpo/train_diffusion_dpo.py index 3cec037e2544..ab88d4967766 100644 --- a/examples/research_projects/diffusion_dpo/train_diffusion_dpo.py +++ b/examples/research_projects/diffusion_dpo/train_diffusion_dpo.py @@ -49,7 +49,7 @@ DPMSolverMultistepScheduler, UNet2DConditionModel, ) -from diffusers.loaders import LoraLoaderMixin +from diffusers.loaders import StableDiffusionLoraLoaderMixin from diffusers.optimization import get_scheduler from diffusers.utils import check_min_version, convert_state_dict_to_diffusers from diffusers.utils.import_utils import is_xformers_available @@ -604,7 +604,7 @@ def save_model_hook(models, weights, output_dir): # make sure to pop weight so that corresponding model is not saved again weights.pop() - LoraLoaderMixin.save_lora_weights( + StableDiffusionLoraLoaderMixin.save_lora_weights( output_dir, unet_lora_layers=unet_lora_layers_to_save, text_encoder_lora_layers=None, @@ -621,8 +621,8 @@ def load_model_hook(models, input_dir): else: raise ValueError(f"unexpected save model: {model.__class__}") - lora_state_dict, network_alphas = LoraLoaderMixin.lora_state_dict(input_dir) - LoraLoaderMixin.load_lora_into_unet(lora_state_dict, network_alphas=network_alphas, unet=unet_) + lora_state_dict, network_alphas = StableDiffusionLoraLoaderMixin.lora_state_dict(input_dir) + StableDiffusionLoraLoaderMixin.load_lora_into_unet(lora_state_dict, network_alphas=network_alphas, unet=unet_) accelerator.register_save_state_pre_hook(save_model_hook) accelerator.register_load_state_pre_hook(load_model_hook) @@ -951,7 +951,7 @@ def collate_fn(examples): unet = unet.to(torch.float32) unet_lora_state_dict = convert_state_dict_to_diffusers(get_peft_model_state_dict(unet)) - LoraLoaderMixin.save_lora_weights( + StableDiffusionLoraLoaderMixin.save_lora_weights( save_directory=args.output_dir, unet_lora_layers=unet_lora_state_dict, text_encoder_lora_layers=None ) diff --git a/examples/research_projects/promptdiffusion/pipeline_prompt_diffusion.py 
b/examples/research_projects/promptdiffusion/pipeline_prompt_diffusion.py index 9495658c4904..74f75f7dce35 100644 --- a/examples/research_projects/promptdiffusion/pipeline_prompt_diffusion.py +++ b/examples/research_projects/promptdiffusion/pipeline_prompt_diffusion.py @@ -28,7 +28,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection from diffusers.image_processor import PipelineImageInput, VaeImageProcessor -from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from diffusers.loaders import FromSingleFileMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from diffusers.models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from diffusers.models.lora import adjust_lora_scale_text_encoder from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel @@ -142,7 +142,9 @@ def retrieve_timesteps( return timesteps, num_inference_steps -class PromptDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin): +class PromptDiffusionPipeline( + DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin, FromSingleFileMixin +): r""" Pipeline for text-to-image generation using Stable Diffusion with ControlNet guidance. @@ -153,8 +155,8 @@ class PromptDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files Args: @@ -348,7 +350,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -480,7 +482,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/examples/research_projects/scheduled_huber_loss_training/dreambooth/train_dreambooth_lora.py b/examples/research_projects/scheduled_huber_loss_training/dreambooth/train_dreambooth_lora.py index 250f8702d996..663dbbf99473 100644 --- a/examples/research_projects/scheduled_huber_loss_training/dreambooth/train_dreambooth_lora.py +++ b/examples/research_projects/scheduled_huber_loss_training/dreambooth/train_dreambooth_lora.py @@ -52,7 +52,7 @@ StableDiffusionPipeline, UNet2DConditionModel, ) -from diffusers.loaders import LoraLoaderMixin +from diffusers.loaders import StableDiffusionLoraLoaderMixin from diffusers.optimization import get_scheduler from diffusers.training_utils import 
_set_state_dict_into_text_encoder, cast_training_params from diffusers.utils import ( @@ -999,7 +999,7 @@ def save_model_hook(models, weights, output_dir): # make sure to pop weight so that corresponding model is not saved again weights.pop() - LoraLoaderMixin.save_lora_weights( + StableDiffusionLoraLoaderMixin.save_lora_weights( output_dir, unet_lora_layers=unet_lora_layers_to_save, text_encoder_lora_layers=text_encoder_lora_layers_to_save, @@ -1019,7 +1019,7 @@ def load_model_hook(models, input_dir): else: raise ValueError(f"unexpected save model: {model.__class__}") - lora_state_dict, network_alphas = LoraLoaderMixin.lora_state_dict(input_dir) + lora_state_dict, network_alphas = StableDiffusionLoraLoaderMixin.lora_state_dict(input_dir) unet_state_dict = {f'{k.replace("unet.", "")}': v for k, v in lora_state_dict.items() if k.startswith("unet.")} unet_state_dict = convert_unet_state_dict_to_peft(unet_state_dict) @@ -1451,7 +1451,7 @@ def compute_text_embeddings(prompt): else: text_encoder_state_dict = None - LoraLoaderMixin.save_lora_weights( + StableDiffusionLoraLoaderMixin.save_lora_weights( save_directory=args.output_dir, unet_lora_layers=unet_lora_state_dict, text_encoder_lora_layers=text_encoder_state_dict, diff --git a/examples/research_projects/scheduled_huber_loss_training/dreambooth/train_dreambooth_lora_sdxl.py b/examples/research_projects/scheduled_huber_loss_training/dreambooth/train_dreambooth_lora_sdxl.py index 8af64622028c..d16780131139 100644 --- a/examples/research_projects/scheduled_huber_loss_training/dreambooth/train_dreambooth_lora_sdxl.py +++ b/examples/research_projects/scheduled_huber_loss_training/dreambooth/train_dreambooth_lora_sdxl.py @@ -59,7 +59,7 @@ StableDiffusionXLPipeline, UNet2DConditionModel, ) -from diffusers.loaders import LoraLoaderMixin +from diffusers.loaders import StableDiffusionLoraLoaderMixin from diffusers.optimization import get_scheduler from diffusers.training_utils import _set_state_dict_into_text_encoder, cast_training_params, compute_snr from diffusers.utils import ( @@ -1334,7 +1334,7 @@ def load_model_hook(models, input_dir): else: raise ValueError(f"unexpected save model: {model.__class__}") - lora_state_dict, network_alphas = LoraLoaderMixin.lora_state_dict(input_dir) + lora_state_dict, network_alphas = StableDiffusionLoraLoaderMixin.lora_state_dict(input_dir) unet_state_dict = {f'{k.replace("unet.", "")}': v for k, v in lora_state_dict.items() if k.startswith("unet.")} unet_state_dict = convert_unet_state_dict_to_peft(unet_state_dict) diff --git a/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_lora_sdxl.py b/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_lora_sdxl.py index d7f2dcaa3442..bab86bf21a76 100644 --- a/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_lora_sdxl.py +++ b/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_lora_sdxl.py @@ -49,7 +49,7 @@ StableDiffusionXLPipeline, UNet2DConditionModel, ) -from diffusers.loaders import LoraLoaderMixin +from diffusers.loaders import StableDiffusionLoraLoaderMixin from diffusers.optimization import get_scheduler from diffusers.training_utils import _set_state_dict_into_text_encoder, cast_training_params, compute_snr from diffusers.utils import ( @@ -749,7 +749,7 @@ def load_model_hook(models, input_dir): else: raise ValueError(f"unexpected save model: {model.__class__}") - lora_state_dict, _ = 
LoraLoaderMixin.lora_state_dict(input_dir) + lora_state_dict, _ = StableDiffusionLoraLoaderMixin.lora_state_dict(input_dir) unet_state_dict = {f'{k.replace("unet.", "")}': v for k, v in lora_state_dict.items() if k.startswith("unet.")} unet_state_dict = convert_unet_state_dict_to_peft(unet_state_dict) incompatible_keys = set_peft_model_state_dict(unet_, unet_state_dict, adapter_name="default") diff --git a/examples/text_to_image/train_text_to_image_lora_sdxl.py b/examples/text_to_image/train_text_to_image_lora_sdxl.py index af7eeb805292..416ed943594d 100644 --- a/examples/text_to_image/train_text_to_image_lora_sdxl.py +++ b/examples/text_to_image/train_text_to_image_lora_sdxl.py @@ -50,7 +50,7 @@ StableDiffusionXLPipeline, UNet2DConditionModel, ) -from diffusers.loaders import LoraLoaderMixin +from diffusers.loaders import StableDiffusionLoraLoaderMixin from diffusers.optimization import get_scheduler from diffusers.training_utils import _set_state_dict_into_text_encoder, cast_training_params, compute_snr from diffusers.utils import ( @@ -766,7 +766,7 @@ def load_model_hook(models, input_dir): else: raise ValueError(f"unexpected save model: {model.__class__}") - lora_state_dict, _ = LoraLoaderMixin.lora_state_dict(input_dir) + lora_state_dict, _ = StableDiffusionLoraLoaderMixin.lora_state_dict(input_dir) unet_state_dict = {f'{k.replace("unet.", "")}': v for k, v in lora_state_dict.items() if k.startswith("unet.")} unet_state_dict = convert_unet_state_dict_to_peft(unet_state_dict) incompatible_keys = set_peft_model_state_dict(unet_, unet_state_dict, adapter_name="default") diff --git a/scripts/convert_animatediff_motion_lora_to_diffusers.py b/scripts/convert_animatediff_motion_lora_to_diffusers.py index c680fdc68462..21567ffa9e7a 100644 --- a/scripts/convert_animatediff_motion_lora_to_diffusers.py +++ b/scripts/convert_animatediff_motion_lora_to_diffusers.py @@ -1,6 +1,8 @@ import argparse +import os import torch +from huggingface_hub import create_repo, upload_folder from safetensors.torch import load_file, save_file @@ -25,8 +27,14 @@ def convert_motion_module(original_state_dict): def get_args(): parser = argparse.ArgumentParser() - parser.add_argument("--ckpt_path", type=str, required=True) - parser.add_argument("--output_path", type=str, required=True) + parser.add_argument("--ckpt_path", type=str, required=True, help="Path to checkpoint") + parser.add_argument("--output_path", type=str, required=True, help="Path to output directory") + parser.add_argument( + "--push_to_hub", + action="store_true", + default=False, + help="Whether to push the converted model to the HF or not", + ) return parser.parse_args() @@ -51,4 +59,11 @@ def get_args(): continue output_dict.update({f"unet.{module_name}": params}) - save_file(output_dict, f"{args.output_path}/diffusion_pytorch_model.safetensors") + os.makedirs(args.output_path, exist_ok=True) + + filepath = os.path.join(args.output_path, "diffusion_pytorch_model.safetensors") + save_file(output_dict, filepath) + + if args.push_to_hub: + repo_id = create_repo(args.output_path, exist_ok=True).repo_id + upload_folder(repo_id=repo_id, folder_path=args.output_path, repo_type="model") diff --git a/scripts/convert_animatediff_sparsectrl_to_diffusers.py b/scripts/convert_animatediff_sparsectrl_to_diffusers.py new file mode 100644 index 000000000000..f246dceb97f8 --- /dev/null +++ b/scripts/convert_animatediff_sparsectrl_to_diffusers.py @@ -0,0 +1,83 @@ +import argparse +from typing import Dict + +import torch +import torch.nn as nn + +from diffusers import 
SparseControlNetModel + + +KEYS_RENAME_MAPPING = { + ".attention_blocks.0": ".attn1", + ".attention_blocks.1": ".attn2", + ".attn1.pos_encoder": ".pos_embed", + ".ff_norm": ".norm3", + ".norms.0": ".norm1", + ".norms.1": ".norm2", + ".temporal_transformer": "", +} + + +def convert(original_state_dict: Dict[str, nn.Module]) -> Dict[str, nn.Module]: + converted_state_dict = {} + + for key in list(original_state_dict.keys()): + renamed_key = key + for new_name, old_name in KEYS_RENAME_MAPPING.items(): + renamed_key = renamed_key.replace(new_name, old_name) + converted_state_dict[renamed_key] = original_state_dict.pop(key) + + return converted_state_dict + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--ckpt_path", type=str, required=True, help="Path to checkpoint") + parser.add_argument("--output_path", type=str, required=True, help="Path to output directory") + parser.add_argument( + "--max_motion_seq_length", + type=int, + default=32, + help="Max motion sequence length supported by the motion adapter", + ) + parser.add_argument( + "--conditioning_channels", type=int, default=4, help="Number of channels in conditioning input to controlnet" + ) + parser.add_argument( + "--use_simplified_condition_embedding", + action="store_true", + default=False, + help="Whether or not to use simplified condition embedding. When `conditioning_channels==4` i.e. latent inputs, set this to `True`. When `conditioning_channels==3` i.e. image inputs, set this to `False`", + ) + parser.add_argument( + "--save_fp16", + action="store_true", + default=False, + help="Whether or not to save model in fp16 precision along with fp32", + ) + parser.add_argument( + "--push_to_hub", action="store_true", default=False, help="Whether or not to push saved model to the HF hub" + ) + return parser.parse_args() + + +if __name__ == "__main__": + args = get_args() + + state_dict = torch.load(args.ckpt_path, map_location="cpu") + if "state_dict" in state_dict.keys(): + state_dict: dict = state_dict["state_dict"] + + controlnet = SparseControlNetModel( + conditioning_channels=args.conditioning_channels, + motion_max_seq_length=args.max_motion_seq_length, + use_simplified_condition_embedding=args.use_simplified_condition_embedding, + ) + + state_dict = convert(state_dict) + controlnet.load_state_dict(state_dict, strict=True) + + controlnet.save_pretrained(args.output_path, push_to_hub=args.push_to_hub) + if args.save_fp16: + controlnet = controlnet.to(dtype=torch.float16) + controlnet.save_pretrained(args.output_path, variant="fp16", push_to_hub=args.push_to_hub) diff --git a/scripts/generate_logits.py b/scripts/generate_logits.py index 89dce0e78d4e..99d46d6628a6 100644 --- a/scripts/generate_logits.py +++ b/scripts/generate_logits.py @@ -103,12 +103,12 @@ models = api.list_models(filter="diffusers") for mod in models: - if "google" in mod.author or mod.modelId == "CompVis/ldm-celebahq-256": - local_checkpoint = "/home/patrick/google_checkpoints/" + mod.modelId.split("/")[-1] + if "google" in mod.author or mod.id == "CompVis/ldm-celebahq-256": + local_checkpoint = "/home/patrick/google_checkpoints/" + mod.id.split("/")[-1] - print(f"Started running {mod.modelId}!!!") + print(f"Started running {mod.id}!!!") - if mod.modelId.startswith("CompVis"): + if mod.id.startswith("CompVis"): model = UNet2DModel.from_pretrained(local_checkpoint, subfolder="unet") else: model = UNet2DModel.from_pretrained(local_checkpoint) @@ -122,6 +122,6 @@ logits = model(noise, time_step).sample assert torch.allclose( - logits[0, 
0, 0, :30], results["_".join("_".join(mod.modelId.split("/")).split("-"))], atol=1e-3 + logits[0, 0, 0, :30], results["_".join("_".join(mod.id.split("/")).split("-"))], atol=1e-3 ) - print(f"{mod.modelId} has passed successfully!!!") + print(f"{mod.id} has passed successfully!!!") diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 6a6607cc376f..f42ccc064624 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -99,6 +99,7 @@ "SD3ControlNetModel", "SD3MultiControlNetModel", "SD3Transformer2DModel", + "SparseControlNetModel", "StableCascadeUNet", "T2IAdapter", "T5FilmDecoder", @@ -231,6 +232,7 @@ "AmusedPipeline", "AnimateDiffPipeline", "AnimateDiffSDXLPipeline", + "AnimateDiffSparseControlNetPipeline", "AnimateDiffVideoToVideoPipeline", "AudioLDM2Pipeline", "AudioLDM2ProjectionModel", @@ -533,6 +535,7 @@ SD3ControlNetModel, SD3MultiControlNetModel, SD3Transformer2DModel, + SparseControlNetModel, T2IAdapter, T5FilmDecoder, Transformer2DModel, @@ -645,6 +648,7 @@ AmusedPipeline, AnimateDiffPipeline, AnimateDiffSDXLPipeline, + AnimateDiffSparseControlNetPipeline, AnimateDiffVideoToVideoPipeline, AudioLDM2Pipeline, AudioLDM2ProjectionModel, diff --git a/src/diffusers/commands/env.py b/src/diffusers/commands/env.py index 7a6b598469a8..d0af30bf1c65 100644 --- a/src/diffusers/commands/env.py +++ b/src/diffusers/commands/env.py @@ -24,7 +24,6 @@ is_bitsandbytes_available, is_flax_available, is_google_colab, - is_notebook, is_peft_available, is_safetensors_available, is_torch_available, @@ -107,8 +106,6 @@ def run(self) -> dict: platform_info = platform.platform() - is_notebook_str = "Yes" if is_notebook() else "No" - is_google_colab_str = "Yes" if is_google_colab() else "No" accelerator = "NA" @@ -123,7 +120,7 @@ def run(self) -> dict: out_str = out_str.decode("utf-8") if len(out_str) > 0: - accelerator = out_str.strip() + " VRAM" + accelerator = out_str.strip() except FileNotFoundError: pass elif platform.system() == "Darwin": # Mac OS @@ -155,7 +152,6 @@ def run(self) -> dict: info = { "🤗 Diffusers version": version, "Platform": platform_info, - "Running on a notebook?": is_notebook_str, "Running on Google Colab?": is_google_colab_str, "Python version": platform.python_version(), "PyTorch version (GPU?)": f"{pt_version} ({pt_cuda_available})", diff --git a/src/diffusers/configuration_utils.py b/src/diffusers/configuration_utils.py index 3a8ab3ac3afc..3dccd785cae4 100644 --- a/src/diffusers/configuration_utils.py +++ b/src/diffusers/configuration_utils.py @@ -310,9 +310,6 @@ def load_config( force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. - resume_download: - Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 - of Diffusers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. 
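The `generate_logits.py` hunk above tracks the `huggingface_hub` rename of `ModelInfo.modelId` to `ModelInfo.id`. A tiny sketch of the new attribute; the filter and limit values are arbitrary:

```python
from huggingface_hub import HfApi

api = HfApi()
for mod in api.list_models(filter="diffusers", limit=3):
    # `mod.modelId` is deprecated upstream; `mod.id` carries the same "org/name" string.
    print(mod.id, mod.author)
```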
@@ -343,7 +340,6 @@ def load_config( local_dir = kwargs.pop("local_dir", None) local_dir_use_symlinks = kwargs.pop("local_dir_use_symlinks", "auto") force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", None) proxies = kwargs.pop("proxies", None) token = kwargs.pop("token", None) local_files_only = kwargs.pop("local_files_only", False) @@ -386,7 +382,6 @@ def load_config( cache_dir=cache_dir, force_download=force_download, proxies=proxies, - resume_download=resume_download, local_files_only=local_files_only, token=token, user_agent=user_agent, diff --git a/src/diffusers/loaders/__init__.py b/src/diffusers/loaders/__init__.py index ded53733f052..5db13825c9eb 100644 --- a/src/diffusers/loaders/__init__.py +++ b/src/diffusers/loaders/__init__.py @@ -55,11 +55,18 @@ def text_encoder_attn_modules(text_encoder): if is_torch_available(): _import_structure["single_file_model"] = ["FromOriginalModelMixin"] + _import_structure["unet"] = ["UNet2DConditionLoadersMixin"] _import_structure["utils"] = ["AttnProcsLayers"] if is_transformers_available(): _import_structure["single_file"] = ["FromSingleFileMixin"] - _import_structure["lora"] = ["LoraLoaderMixin", "StableDiffusionXLLoraLoaderMixin", "SD3LoraLoaderMixin"] + _import_structure["lora_pipeline"] = [ + "AmusedLoraLoaderMixin", + "StableDiffusionLoraLoaderMixin", + "SD3LoraLoaderMixin", + "StableDiffusionXLLoraLoaderMixin", + "LoraLoaderMixin", + ] _import_structure["textual_inversion"] = ["TextualInversionLoaderMixin"] _import_structure["ip_adapter"] = ["IPAdapterMixin"] @@ -74,7 +81,13 @@ def text_encoder_attn_modules(text_encoder): if is_transformers_available(): from .ip_adapter import IPAdapterMixin - from .lora import LoraLoaderMixin, SD3LoraLoaderMixin, StableDiffusionXLLoraLoaderMixin + from .lora_pipeline import ( + AmusedLoraLoaderMixin, + LoraLoaderMixin, + SD3LoraLoaderMixin, + StableDiffusionLoraLoaderMixin, + StableDiffusionXLLoraLoaderMixin, + ) from .single_file import FromSingleFileMixin from .textual_inversion import TextualInversionLoaderMixin diff --git a/src/diffusers/loaders/ip_adapter.py b/src/diffusers/loaders/ip_adapter.py index ef6a53e43196..44c8c0a5181c 100644 --- a/src/diffusers/loaders/ip_adapter.py +++ b/src/diffusers/loaders/ip_adapter.py @@ -90,9 +90,7 @@ def load_ip_adapter( force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. - resume_download: - Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 - of Diffusers. + proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. @@ -135,7 +133,6 @@ def load_ip_adapter( # Load the main state dict first. 
cache_dir = kwargs.pop("cache_dir", None) force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", None) proxies = kwargs.pop("proxies", None) local_files_only = kwargs.pop("local_files_only", None) token = kwargs.pop("token", None) @@ -171,7 +168,6 @@ def load_ip_adapter( weights_name=weight_name, cache_dir=cache_dir, force_download=force_download, - resume_download=resume_download, proxies=proxies, local_files_only=local_files_only, token=token, @@ -226,7 +222,8 @@ def load_ip_adapter( # create feature extractor if it has not been registered to the pipeline yet if hasattr(self, "feature_extractor") and getattr(self, "feature_extractor", None) is None: - feature_extractor = CLIPImageProcessor() + clip_image_size = self.image_encoder.config.image_size + feature_extractor = CLIPImageProcessor(size=clip_image_size, crop_size=clip_image_size) self.register_modules(feature_extractor=feature_extractor) # load ip-adapter into unet @@ -323,7 +320,13 @@ def unload_ip_adapter(self): # remove hidden encoder self.unet.encoder_hid_proj = None - self.config.encoder_hid_dim_type = None + self.unet.config.encoder_hid_dim_type = None + + # Kolors: restore `encoder_hid_proj` with `text_encoder_hid_proj` + if hasattr(self.unet, "text_encoder_hid_proj") and self.unet.text_encoder_hid_proj is not None: + self.unet.encoder_hid_proj = self.unet.text_encoder_hid_proj + self.unet.text_encoder_hid_proj = None + self.unet.config.encoder_hid_dim_type = "text_proj" # restore original Unet attention processors layers attn_procs = {} diff --git a/src/diffusers/loaders/lora_base.py b/src/diffusers/loaders/lora_base.py new file mode 100644 index 000000000000..4b963270427b --- /dev/null +++ b/src/diffusers/loaders/lora_base.py @@ -0,0 +1,752 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import inspect +import os +from pathlib import Path +from typing import Callable, Dict, List, Optional, Union + +import safetensors +import torch +import torch.nn as nn +from huggingface_hub import model_info +from huggingface_hub.constants import HF_HUB_OFFLINE + +from ..models.modeling_utils import ModelMixin, load_state_dict +from ..utils import ( + USE_PEFT_BACKEND, + _get_model_file, + delete_adapter_layers, + deprecate, + is_accelerate_available, + is_peft_available, + is_transformers_available, + logging, + recurse_remove_peft_layers, + set_adapter_layers, + set_weights_and_activate_adapters, +) + + +if is_transformers_available(): + from transformers import PreTrainedModel + +if is_peft_available(): + from peft.tuners.tuners_utils import BaseTunerLayer + +if is_accelerate_available(): + from accelerate.hooks import AlignDevicesHook, CpuOffload, remove_hook_from_module + +logger = logging.get_logger(__name__) + + +def fuse_text_encoder_lora(text_encoder, lora_scale=1.0, safe_fusing=False, adapter_names=None): + """ + Fuses LoRAs for the text encoder. 
+ + Args: + text_encoder (`torch.nn.Module`): + The text encoder module to set the adapter layers for. If `None`, it will try to get the `text_encoder` + attribute. + lora_scale (`float`, defaults to 1.0): + Controls how much to influence the outputs with the LoRA parameters. + safe_fusing (`bool`, defaults to `False`): + Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them. + adapter_names (`List[str]` or `str`): + The names of the adapters to use. + """ + merge_kwargs = {"safe_merge": safe_fusing} + + for module in text_encoder.modules(): + if isinstance(module, BaseTunerLayer): + if lora_scale != 1.0: + module.scale_layer(lora_scale) + + # For BC with previous PEFT versions, we need to check the signature + # of the `merge` method to see if it supports the `adapter_names` argument. + supported_merge_kwargs = list(inspect.signature(module.merge).parameters) + if "adapter_names" in supported_merge_kwargs: + merge_kwargs["adapter_names"] = adapter_names + elif "adapter_names" not in supported_merge_kwargs and adapter_names is not None: + raise ValueError( + "The `adapter_names` argument is not supported with your PEFT version. " + "Please upgrade to the latest version of PEFT. `pip install -U peft`" + ) + + module.merge(**merge_kwargs) + + +def unfuse_text_encoder_lora(text_encoder): + """ + Unfuses LoRAs for the text encoder. + + Args: + text_encoder (`torch.nn.Module`): + The text encoder module to set the adapter layers for. If `None`, it will try to get the `text_encoder` + attribute. + """ + for module in text_encoder.modules(): + if isinstance(module, BaseTunerLayer): + module.unmerge() + + +def set_adapters_for_text_encoder( + adapter_names: Union[List[str], str], + text_encoder: Optional["PreTrainedModel"] = None, # noqa: F821 + text_encoder_weights: Optional[Union[float, List[float], List[None]]] = None, +): + """ + Sets the adapter layers for the text encoder. + + Args: + adapter_names (`List[str]` or `str`): + The names of the adapters to use. + text_encoder (`torch.nn.Module`, *optional*): + The text encoder module to set the adapter layers for. If `None`, it will try to get the `text_encoder` + attribute. + text_encoder_weights (`List[float]`, *optional*): + The weights to use for the text encoder. If `None`, the weights are set to `1.0` for all the adapters. + """ + if text_encoder is None: + raise ValueError( + "The pipeline does not have a default `pipe.text_encoder` class. Please make sure to pass a `text_encoder` instead." + ) + + def process_weights(adapter_names, weights): + # Expand weights into a list, one entry per adapter + # e.g. for 2 adapters: 7 -> [7,7] ; [3, None] -> [3, None] + if not isinstance(weights, list): + weights = [weights] * len(adapter_names) + + if len(adapter_names) != len(weights): + raise ValueError( + f"Length of adapter names {len(adapter_names)} is not equal to the length of the weights {len(weights)}" + ) + + # Set None values to default of 1.0 + # e.g. [7,7] -> [7,7] ; [3, None] -> [3,1] + weights = [w if w is not None else 1.0 for w in weights] + + return weights + + adapter_names = [adapter_names] if isinstance(adapter_names, str) else adapter_names + text_encoder_weights = process_weights(adapter_names, text_encoder_weights) + set_weights_and_activate_adapters(text_encoder, adapter_names, text_encoder_weights) + + +def disable_lora_for_text_encoder(text_encoder: Optional["PreTrainedModel"] = None): + """ + Disables the LoRA layers for the text encoder. 
+ + Args: + text_encoder (`torch.nn.Module`, *optional*): + The text encoder module to disable the LoRA layers for. If `None`, it will try to get the `text_encoder` + attribute. + """ + if text_encoder is None: + raise ValueError("Text Encoder not found.") + set_adapter_layers(text_encoder, enabled=False) + + +def enable_lora_for_text_encoder(text_encoder: Optional["PreTrainedModel"] = None): + """ + Enables the LoRA layers for the text encoder. + + Args: + text_encoder (`torch.nn.Module`, *optional*): + The text encoder module to enable the LoRA layers for. If `None`, it will try to get the `text_encoder` + attribute. + """ + if text_encoder is None: + raise ValueError("Text Encoder not found.") + set_adapter_layers(text_encoder, enabled=True) + + +def _remove_text_encoder_monkey_patch(text_encoder): + recurse_remove_peft_layers(text_encoder) + if getattr(text_encoder, "peft_config", None) is not None: + del text_encoder.peft_config + text_encoder._hf_peft_config_loaded = None + + +class LoraBaseMixin: + """Utility class for handling LoRAs.""" + + _lora_loadable_modules = [] + num_fused_loras = 0 + + def load_lora_weights(self, **kwargs): + raise NotImplementedError("`load_lora_weights()` is not implemented.") + + @classmethod + def save_lora_weights(cls, **kwargs): + raise NotImplementedError("`save_lora_weights()` not implemented.") + + @classmethod + def lora_state_dict(cls, **kwargs): + raise NotImplementedError("`lora_state_dict()` is not implemented.") + + @classmethod + def _optionally_disable_offloading(cls, _pipeline): + """ + Optionally removes offloading in case the pipeline has been already sequentially offloaded to CPU. + + Args: + _pipeline (`DiffusionPipeline`): + The pipeline to disable offloading for. + + Returns: + tuple: + A tuple indicating if `is_model_cpu_offload` or `is_sequential_cpu_offload` is True. + """ + is_model_cpu_offload = False + is_sequential_cpu_offload = False + + if _pipeline is not None and _pipeline.hf_device_map is None: + for _, component in _pipeline.components.items(): + if isinstance(component, nn.Module) and hasattr(component, "_hf_hook"): + if not is_model_cpu_offload: + is_model_cpu_offload = isinstance(component._hf_hook, CpuOffload) + if not is_sequential_cpu_offload: + is_sequential_cpu_offload = ( + isinstance(component._hf_hook, AlignDevicesHook) + or hasattr(component._hf_hook, "hooks") + and isinstance(component._hf_hook.hooks[0], AlignDevicesHook) + ) + + logger.info( + "Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again." 
+ ) + remove_hook_from_module(component, recurse=is_sequential_cpu_offload) + + return (is_model_cpu_offload, is_sequential_cpu_offload) + + @classmethod + def _fetch_state_dict( + cls, + pretrained_model_name_or_path_or_dict, + weight_name, + use_safetensors, + local_files_only, + cache_dir, + force_download, + proxies, + token, + revision, + subfolder, + user_agent, + allow_pickle, + ): + from .lora_pipeline import LORA_WEIGHT_NAME, LORA_WEIGHT_NAME_SAFE + + model_file = None + if not isinstance(pretrained_model_name_or_path_or_dict, dict): + # Let's first try to load .safetensors weights + if (use_safetensors and weight_name is None) or ( + weight_name is not None and weight_name.endswith(".safetensors") + ): + try: + # Here we're relaxing the loading check to enable more Inference API + # friendliness where sometimes, it's not at all possible to automatically + # determine `weight_name`. + if weight_name is None: + weight_name = cls._best_guess_weight_name( + pretrained_model_name_or_path_or_dict, + file_extension=".safetensors", + local_files_only=local_files_only, + ) + model_file = _get_model_file( + pretrained_model_name_or_path_or_dict, + weights_name=weight_name or LORA_WEIGHT_NAME_SAFE, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + user_agent=user_agent, + ) + state_dict = safetensors.torch.load_file(model_file, device="cpu") + except (IOError, safetensors.SafetensorError) as e: + if not allow_pickle: + raise e + # try loading non-safetensors weights + model_file = None + pass + + if model_file is None: + if weight_name is None: + weight_name = cls._best_guess_weight_name( + pretrained_model_name_or_path_or_dict, file_extension=".bin", local_files_only=local_files_only + ) + model_file = _get_model_file( + pretrained_model_name_or_path_or_dict, + weights_name=weight_name or LORA_WEIGHT_NAME, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + user_agent=user_agent, + ) + state_dict = load_state_dict(model_file) + else: + state_dict = pretrained_model_name_or_path_or_dict + + return state_dict + + @classmethod + def _best_guess_weight_name( + cls, pretrained_model_name_or_path_or_dict, file_extension=".safetensors", local_files_only=False + ): + from .lora_pipeline import LORA_WEIGHT_NAME, LORA_WEIGHT_NAME_SAFE + + if local_files_only or HF_HUB_OFFLINE: + raise ValueError("When using the offline mode, you must specify a `weight_name`.") + + targeted_files = [] + + if os.path.isfile(pretrained_model_name_or_path_or_dict): + return + elif os.path.isdir(pretrained_model_name_or_path_or_dict): + targeted_files = [ + f for f in os.listdir(pretrained_model_name_or_path_or_dict) if f.endswith(file_extension) + ] + else: + files_in_repo = model_info(pretrained_model_name_or_path_or_dict).siblings + targeted_files = [f.rfilename for f in files_in_repo if f.rfilename.endswith(file_extension)] + if len(targeted_files) == 0: + return + + # "scheduler" does not correspond to a LoRA checkpoint. + # "optimizer" does not correspond to a LoRA checkpoint + # only top-level checkpoints are considered and not the other ones, hence "checkpoint". 
+ unallowed_substrings = {"scheduler", "optimizer", "checkpoint"} + targeted_files = list( + filter(lambda x: all(substring not in x for substring in unallowed_substrings), targeted_files) + ) + + if any(f.endswith(LORA_WEIGHT_NAME) for f in targeted_files): + targeted_files = list(filter(lambda x: x.endswith(LORA_WEIGHT_NAME), targeted_files)) + elif any(f.endswith(LORA_WEIGHT_NAME_SAFE) for f in targeted_files): + targeted_files = list(filter(lambda x: x.endswith(LORA_WEIGHT_NAME_SAFE), targeted_files)) + + if len(targeted_files) > 1: + raise ValueError( + f"Provided path contains more than one weights file in the {file_extension} format. Either specify `weight_name` in `load_lora_weights` or make sure there's only one `.safetensors` or `.bin` file in {pretrained_model_name_or_path_or_dict}." + ) + weight_name = targeted_files[0] + return weight_name + + def unload_lora_weights(self): + """ + Unloads the LoRA parameters. + + Examples: + + ```python + >>> # Assuming `pipeline` is already loaded with the LoRA parameters. + >>> pipeline.unload_lora_weights() + >>> ... + ``` + """ + if not USE_PEFT_BACKEND: + raise ValueError("PEFT backend is required for this method.") + + for component in self._lora_loadable_modules: + model = getattr(self, component, None) + if model is not None: + if issubclass(model.__class__, ModelMixin): + model.unload_lora() + elif issubclass(model.__class__, PreTrainedModel): + _remove_text_encoder_monkey_patch(model) + + def fuse_lora( + self, + components: List[str] = [], + lora_scale: float = 1.0, + safe_fusing: bool = False, + adapter_names: Optional[List[str]] = None, + **kwargs, + ): + r""" + Fuses the LoRA parameters into the original parameters of the corresponding blocks. + + + + This is an experimental API. + + + + Args: + components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into. + lora_scale (`float`, defaults to 1.0): + Controls how much to influence the outputs with the LoRA parameters. + safe_fusing (`bool`, defaults to `False`): + Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them. + adapter_names (`List[str]`, *optional*): + Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused. + + Example: + + ```py + from diffusers import DiffusionPipeline + import torch + + pipeline = DiffusionPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 + ).to("cuda") + pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel") + pipeline.fuse_lora(lora_scale=0.7) + ``` + """ + if "fuse_unet" in kwargs: + depr_message = "Passing `fuse_unet` to `fuse_lora()` is deprecated and will be ignored. Please use the `components` argument and provide a list of the components whose LoRAs are to be fused. `fuse_unet` will be removed in a future version." + deprecate( + "fuse_unet", + "1.0.0", + depr_message, + ) + if "fuse_transformer" in kwargs: + depr_message = "Passing `fuse_transformer` to `fuse_lora()` is deprecated and will be ignored. Please use the `components` argument and provide a list of the components whose LoRAs are to be fused. `fuse_transformer` will be removed in a future version." + deprecate( + "fuse_transformer", + "1.0.0", + depr_message, + ) + if "fuse_text_encoder" in kwargs: + depr_message = "Passing `fuse_text_encoder` to `fuse_lora()` is deprecated and will be ignored. 
Please use the `components` argument and provide a list of the components whose LoRAs are to be fused. `fuse_text_encoder` will be removed in a future version." + deprecate( + "fuse_text_encoder", + "1.0.0", + depr_message, + ) + + if len(components) == 0: + raise ValueError("`components` cannot be an empty list.") + + for fuse_component in components: + if fuse_component not in self._lora_loadable_modules: + raise ValueError(f"{fuse_component} is not found in {self._lora_loadable_modules=}.") + + model = getattr(self, fuse_component, None) + if model is not None: + # check if diffusers model + if issubclass(model.__class__, ModelMixin): + model.fuse_lora(lora_scale, safe_fusing=safe_fusing, adapter_names=adapter_names) + # handle transformers models. + if issubclass(model.__class__, PreTrainedModel): + fuse_text_encoder_lora( + model, lora_scale=lora_scale, safe_fusing=safe_fusing, adapter_names=adapter_names + ) + + self.num_fused_loras += 1 + + def unfuse_lora(self, components: List[str] = [], **kwargs): + r""" + Reverses the effect of + [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora). + + + + This is an experimental API. + + + + Args: + components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from. + unfuse_unet (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters. + unfuse_text_encoder (`bool`, defaults to `True`): + Whether to unfuse the text encoder LoRA parameters. If the text encoder wasn't monkey-patched with the + LoRA parameters then it won't have any effect. + """ + if "unfuse_unet" in kwargs: + depr_message = "Passing `unfuse_unet` to `unfuse_lora()` is deprecated and will be ignored. Please use the `components` argument. `unfuse_unet` will be removed in a future version." + deprecate( + "unfuse_unet", + "1.0.0", + depr_message, + ) + if "unfuse_transformer" in kwargs: + depr_message = "Passing `unfuse_transformer` to `unfuse_lora()` is deprecated and will be ignored. Please use the `components` argument. `unfuse_transformer` will be removed in a future version." + deprecate( + "unfuse_transformer", + "1.0.0", + depr_message, + ) + if "unfuse_text_encoder" in kwargs: + depr_message = "Passing `unfuse_text_encoder` to `unfuse_lora()` is deprecated and will be ignored. Please use the `components` argument. `unfuse_text_encoder` will be removed in a future version." 
+ deprecate( + "unfuse_text_encoder", + "1.0.0", + depr_message, + ) + + if len(components) == 0: + raise ValueError("`components` cannot be an empty list.") + + for fuse_component in components: + if fuse_component not in self._lora_loadable_modules: + raise ValueError(f"{fuse_component} is not found in {self._lora_loadable_modules=}.") + + model = getattr(self, fuse_component, None) + if model is not None: + if issubclass(model.__class__, (ModelMixin, PreTrainedModel)): + for module in model.modules(): + if isinstance(module, BaseTunerLayer): + module.unmerge() + + self.num_fused_loras -= 1 + + def set_adapters( + self, + adapter_names: Union[List[str], str], + adapter_weights: Optional[Union[float, Dict, List[float], List[Dict]]] = None, + ): + adapter_names = [adapter_names] if isinstance(adapter_names, str) else adapter_names + + adapter_weights = copy.deepcopy(adapter_weights) + + # Expand weights into a list, one entry per adapter + if not isinstance(adapter_weights, list): + adapter_weights = [adapter_weights] * len(adapter_names) + + if len(adapter_names) != len(adapter_weights): + raise ValueError( + f"Length of adapter names {len(adapter_names)} is not equal to the length of the weights {len(adapter_weights)}" + ) + + list_adapters = self.get_list_adapters() # eg {"unet": ["adapter1", "adapter2"], "text_encoder": ["adapter2"]} + all_adapters = { + adapter for adapters in list_adapters.values() for adapter in adapters + } # eg ["adapter1", "adapter2"] + invert_list_adapters = { + adapter: [part for part, adapters in list_adapters.items() if adapter in adapters] + for adapter in all_adapters + } # eg {"adapter1": ["unet"], "adapter2": ["unet", "text_encoder"]} + + # Decompose weights into weights for denoiser and text encoders. + _component_adapter_weights = {} + for component in self._lora_loadable_modules: + model = getattr(self, component) + + for adapter_name, weights in zip(adapter_names, adapter_weights): + if isinstance(weights, dict): + component_adapter_weights = weights.pop(component, None) + + if component_adapter_weights is not None and not hasattr(self, component): + logger.warning( + f"Lora weight dict contains {component} weights but will be ignored because pipeline does not have {component}." + ) + + if component_adapter_weights is not None and component not in invert_list_adapters[adapter_name]: + logger.warning( + ( + f"Lora weight dict for adapter '{adapter_name}' contains {component}," + f"but this will be ignored because {adapter_name} does not contain weights for {component}." + f"Valid parts for {adapter_name} are: {invert_list_adapters[adapter_name]}." 
+ ) + ) + + else: + component_adapter_weights = weights + + _component_adapter_weights.setdefault(component, []) + _component_adapter_weights[component].append(component_adapter_weights) + + if issubclass(model.__class__, ModelMixin): + model.set_adapters(adapter_names, _component_adapter_weights[component]) + elif issubclass(model.__class__, PreTrainedModel): + set_adapters_for_text_encoder(adapter_names, model, _component_adapter_weights[component]) + + def disable_lora(self): + if not USE_PEFT_BACKEND: + raise ValueError("PEFT backend is required for this method.") + + for component in self._lora_loadable_modules: + model = getattr(self, component, None) + if model is not None: + if issubclass(model.__class__, ModelMixin): + model.disable_lora() + elif issubclass(model.__class__, PreTrainedModel): + disable_lora_for_text_encoder(model) + + def enable_lora(self): + if not USE_PEFT_BACKEND: + raise ValueError("PEFT backend is required for this method.") + + for component in self._lora_loadable_modules: + model = getattr(self, component, None) + if model is not None: + if issubclass(model.__class__, ModelMixin): + model.enable_lora() + elif issubclass(model.__class__, PreTrainedModel): + enable_lora_for_text_encoder(model) + + def delete_adapters(self, adapter_names: Union[List[str], str]): + """ + Args: + Deletes the LoRA layers of `adapter_name` for the unet and text-encoder(s). + adapter_names (`Union[List[str], str]`): + The names of the adapter to delete. Can be a single string or a list of strings + """ + if not USE_PEFT_BACKEND: + raise ValueError("PEFT backend is required for this method.") + + if isinstance(adapter_names, str): + adapter_names = [adapter_names] + + for component in self._lora_loadable_modules: + model = getattr(self, component, None) + if model is not None: + if issubclass(model.__class__, ModelMixin): + model.delete_adapters(adapter_names) + elif issubclass(model.__class__, PreTrainedModel): + for adapter_name in adapter_names: + delete_adapter_layers(model, adapter_name) + + def get_active_adapters(self) -> List[str]: + """ + Gets the list of the current active adapters. + + Example: + + ```python + from diffusers import DiffusionPipeline + + pipeline = DiffusionPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", + ).to("cuda") + pipeline.load_lora_weights("CiroN2022/toy-face", weight_name="toy_face_sdxl.safetensors", adapter_name="toy") + pipeline.get_active_adapters() + ``` + """ + if not USE_PEFT_BACKEND: + raise ValueError( + "PEFT backend is required for this method. Please install the latest version of PEFT `pip install -U peft`" + ) + + active_adapters = [] + + for component in self._lora_loadable_modules: + model = getattr(self, component, None) + if model is not None and issubclass(model.__class__, ModelMixin): + for module in model.modules(): + if isinstance(module, BaseTunerLayer): + active_adapters = module.active_adapters + break + + return active_adapters + + def get_list_adapters(self) -> Dict[str, List[str]]: + """ + Gets the current list of all available adapters in the pipeline. + """ + if not USE_PEFT_BACKEND: + raise ValueError( + "PEFT backend is required for this method. 
Please install the latest version of PEFT `pip install -U peft`" + ) + + set_adapters = {} + + for component in self._lora_loadable_modules: + model = getattr(self, component, None) + if ( + model is not None + and issubclass(model.__class__, (ModelMixin, PreTrainedModel)) + and hasattr(model, "peft_config") + ): + set_adapters[component] = list(model.peft_config.keys()) + + return set_adapters + + def set_lora_device(self, adapter_names: List[str], device: Union[torch.device, str, int]) -> None: + """ + Moves the LoRAs listed in `adapter_names` to a target device. Useful for offloading the LoRA to the CPU in case + you want to load multiple adapters and free some GPU memory. + + Args: + adapter_names (`List[str]`): + List of adapters to send device to. + device (`Union[torch.device, str, int]`): + Device to send the adapters to. Can be either a torch device, a str or an integer. + """ + if not USE_PEFT_BACKEND: + raise ValueError("PEFT backend is required for this method.") + + for component in self._lora_loadable_modules: + model = getattr(self, component, None) + if model is not None: + for module in model.modules(): + if isinstance(module, BaseTunerLayer): + for adapter_name in adapter_names: + module.lora_A[adapter_name].to(device) + module.lora_B[adapter_name].to(device) + # this is a param, not a module, so device placement is not in-place -> re-assign + if hasattr(module, "lora_magnitude_vector") and module.lora_magnitude_vector is not None: + module.lora_magnitude_vector[adapter_name] = module.lora_magnitude_vector[ + adapter_name + ].to(device) + + @staticmethod + def pack_weights(layers, prefix): + layers_weights = layers.state_dict() if isinstance(layers, torch.nn.Module) else layers + layers_state_dict = {f"{prefix}.{module_name}": param for module_name, param in layers_weights.items()} + return layers_state_dict + + @staticmethod + def write_lora_layers( + state_dict: Dict[str, torch.Tensor], + save_directory: str, + is_main_process: bool, + weight_name: str, + save_function: Callable, + safe_serialization: bool, + ): + from .lora_pipeline import LORA_WEIGHT_NAME, LORA_WEIGHT_NAME_SAFE + + if os.path.isfile(save_directory): + logger.error(f"Provided path ({save_directory}) should be a directory, not a file") + return + + if save_function is None: + if safe_serialization: + + def save_function(weights, filename): + return safetensors.torch.save_file(weights, filename, metadata={"format": "pt"}) + + else: + save_function = torch.save + + os.makedirs(save_directory, exist_ok=True) + + if weight_name is None: + if safe_serialization: + weight_name = LORA_WEIGHT_NAME_SAFE + else: + weight_name = LORA_WEIGHT_NAME + + save_path = Path(save_directory, weight_name).as_posix() + save_function(state_dict, save_path) + logger.info(f"Model weights saved in {save_path}") + + @property + def lora_scale(self) -> float: + # property function that returns the lora scale which can be set at run time by the pipeline. + # if _lora_scale has not been set, return 1 + return self._lora_scale if hasattr(self, "_lora_scale") else 1.0 diff --git a/src/diffusers/loaders/lora.py b/src/diffusers/loaders/lora_pipeline.py similarity index 53% rename from src/diffusers/loaders/lora.py rename to src/diffusers/loaders/lora_pipeline.py index 539a5637b7be..73273618956a 100644 --- a/src/diffusers/loaders/lora.py +++ b/src/diffusers/loaders/lora_pipeline.py @@ -11,49 +11,32 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -import copy -import inspect import os -from pathlib import Path from typing import Callable, Dict, List, Optional, Union -import safetensors import torch -from huggingface_hub import model_info -from huggingface_hub.constants import HF_HUB_OFFLINE from huggingface_hub.utils import validate_hf_hub_args -from torch import nn -from ..models.modeling_utils import load_state_dict from ..utils import ( USE_PEFT_BACKEND, - _get_model_file, convert_state_dict_to_diffusers, convert_state_dict_to_peft, convert_unet_state_dict_to_peft, - delete_adapter_layers, + deprecate, get_adapter_name, get_peft_kwargs, - is_accelerate_available, is_peft_version, is_transformers_available, logging, - recurse_remove_peft_layers, scale_lora_layers, - set_adapter_layers, - set_weights_and_activate_adapters, ) +from .lora_base import LoraBaseMixin from .lora_conversion_utils import _convert_non_diffusers_lora_to_diffusers, _maybe_map_sgm_blocks_to_diffusers if is_transformers_available(): - from transformers import PreTrainedModel - from ..models.lora import text_encoder_attn_modules, text_encoder_mlp_modules -if is_accelerate_available(): - from accelerate.hooks import AlignDevicesHook, CpuOffload, remove_hook_from_module - logger = logging.get_logger(__name__) TEXT_ENCODER_NAME = "text_encoder" @@ -63,19 +46,16 @@ LORA_WEIGHT_NAME = "pytorch_lora_weights.bin" LORA_WEIGHT_NAME_SAFE = "pytorch_lora_weights.safetensors" -LORA_DEPRECATION_MESSAGE = "You are using an old version of LoRA backend. This will be deprecated in the next releases in favor of PEFT make sure to install the latest PEFT and transformers packages in the future." - -class LoraLoaderMixin: +class StableDiffusionLoraLoaderMixin(LoraBaseMixin): r""" - Load LoRA layers into [`UNet2DConditionModel`] and + Load LoRA layers into Stable Diffusion [`UNet2DConditionModel`] and [`CLIPTextModel`](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel). """ - text_encoder_name = TEXT_ENCODER_NAME + _lora_loadable_modules = ["unet", "text_encoder"] unet_name = UNET_NAME - transformer_name = TRANSFORMER_NAME - num_fused_loras = 0 + text_encoder_name = TEXT_ENCODER_NAME def load_lora_weights( self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], adapter_name=None, **kwargs @@ -86,19 +66,20 @@ def load_lora_weights( All kwargs are forwarded to `self.lora_state_dict`. - See [`~loaders.LoraLoaderMixin.lora_state_dict`] for more details on how the state dict is loaded. + See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details on how the state dict is + loaded. - See [`~loaders.LoraLoaderMixin.load_lora_into_unet`] for more details on how the state dict is loaded into - `self.unet`. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_unet`] for more details on how the state dict is + loaded into `self.unet`. - See [`~loaders.LoraLoaderMixin.load_lora_into_text_encoder`] for more details on how the state dict is loaded - into `self.text_encoder`. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_text_encoder`] for more details on how the state + dict is loaded into `self.text_encoder`. Parameters: pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - See [`~loaders.LoraLoaderMixin.lora_state_dict`]. + See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. kwargs (`dict`, *optional*): - See [`~loaders.LoraLoaderMixin.lora_state_dict`]. 
+ See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. adapter_name (`str`, *optional*): Adapter name to be used for referencing the loaded adapter model. If not specified, it will use `default_{i}` where i is the total number of adapters being loaded. @@ -170,9 +151,7 @@ def lora_state_dict( force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. - resume_download: - Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 - of Diffusers. + proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. @@ -194,7 +173,6 @@ def lora_state_dict( # UNet and text encoder or both. cache_dir = kwargs.pop("cache_dir", None) force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", None) proxies = kwargs.pop("proxies", None) local_files_only = kwargs.pop("local_files_only", None) token = kwargs.pop("token", None) @@ -214,64 +192,20 @@ def lora_state_dict( "framework": "pytorch", } - model_file = None - if not isinstance(pretrained_model_name_or_path_or_dict, dict): - # Let's first try to load .safetensors weights - if (use_safetensors and weight_name is None) or ( - weight_name is not None and weight_name.endswith(".safetensors") - ): - try: - # Here we're relaxing the loading check to enable more Inference API - # friendliness where sometimes, it's not at all possible to automatically - # determine `weight_name`. - if weight_name is None: - weight_name = cls._best_guess_weight_name( - pretrained_model_name_or_path_or_dict, - file_extension=".safetensors", - local_files_only=local_files_only, - ) - model_file = _get_model_file( - pretrained_model_name_or_path_or_dict, - weights_name=weight_name or LORA_WEIGHT_NAME_SAFE, - cache_dir=cache_dir, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - local_files_only=local_files_only, - token=token, - revision=revision, - subfolder=subfolder, - user_agent=user_agent, - ) - state_dict = safetensors.torch.load_file(model_file, device="cpu") - except (IOError, safetensors.SafetensorError) as e: - if not allow_pickle: - raise e - # try loading non-safetensors weights - model_file = None - pass - - if model_file is None: - if weight_name is None: - weight_name = cls._best_guess_weight_name( - pretrained_model_name_or_path_or_dict, file_extension=".bin", local_files_only=local_files_only - ) - model_file = _get_model_file( - pretrained_model_name_or_path_or_dict, - weights_name=weight_name or LORA_WEIGHT_NAME, - cache_dir=cache_dir, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - local_files_only=local_files_only, - token=token, - revision=revision, - subfolder=subfolder, - user_agent=user_agent, - ) - state_dict = load_state_dict(model_file) - else: - state_dict = pretrained_model_name_or_path_or_dict + state_dict = cls._fetch_state_dict( + pretrained_model_name_or_path_or_dict=pretrained_model_name_or_path_or_dict, + weight_name=weight_name, + use_safetensors=use_safetensors, + local_files_only=local_files_only, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + token=token, + revision=revision, + subfolder=subfolder, + user_agent=user_agent, + allow_pickle=allow_pickle, + 
) network_alphas = None # TODO: replace it with a method from `state_dict_utils` @@ -292,82 +226,6 @@ def lora_state_dict( return state_dict, network_alphas - @classmethod - def _best_guess_weight_name( - cls, pretrained_model_name_or_path_or_dict, file_extension=".safetensors", local_files_only=False - ): - if local_files_only or HF_HUB_OFFLINE: - raise ValueError("When using the offline mode, you must specify a `weight_name`.") - - targeted_files = [] - - if os.path.isfile(pretrained_model_name_or_path_or_dict): - return - elif os.path.isdir(pretrained_model_name_or_path_or_dict): - targeted_files = [ - f for f in os.listdir(pretrained_model_name_or_path_or_dict) if f.endswith(file_extension) - ] - else: - files_in_repo = model_info(pretrained_model_name_or_path_or_dict).siblings - targeted_files = [f.rfilename for f in files_in_repo if f.rfilename.endswith(file_extension)] - if len(targeted_files) == 0: - return - - # "scheduler" does not correspond to a LoRA checkpoint. - # "optimizer" does not correspond to a LoRA checkpoint - # only top-level checkpoints are considered and not the other ones, hence "checkpoint". - unallowed_substrings = {"scheduler", "optimizer", "checkpoint"} - targeted_files = list( - filter(lambda x: all(substring not in x for substring in unallowed_substrings), targeted_files) - ) - - if any(f.endswith(LORA_WEIGHT_NAME) for f in targeted_files): - targeted_files = list(filter(lambda x: x.endswith(LORA_WEIGHT_NAME), targeted_files)) - elif any(f.endswith(LORA_WEIGHT_NAME_SAFE) for f in targeted_files): - targeted_files = list(filter(lambda x: x.endswith(LORA_WEIGHT_NAME_SAFE), targeted_files)) - - if len(targeted_files) > 1: - raise ValueError( - f"Provided path contains more than one weights file in the {file_extension} format. Either specify `weight_name` in `load_lora_weights` or make sure there's only one `.safetensors` or `.bin` file in {pretrained_model_name_or_path_or_dict}." - ) - weight_name = targeted_files[0] - return weight_name - - @classmethod - def _optionally_disable_offloading(cls, _pipeline): - """ - Optionally removes offloading in case the pipeline has been already sequentially offloaded to CPU. - - Args: - _pipeline (`DiffusionPipeline`): - The pipeline to disable offloading for. - - Returns: - tuple: - A tuple indicating if `is_model_cpu_offload` or `is_sequential_cpu_offload` is True. - """ - is_model_cpu_offload = False - is_sequential_cpu_offload = False - - if _pipeline is not None and _pipeline.hf_device_map is None: - for _, component in _pipeline.components.items(): - if isinstance(component, nn.Module) and hasattr(component, "_hf_hook"): - if not is_model_cpu_offload: - is_model_cpu_offload = isinstance(component._hf_hook, CpuOffload) - if not is_sequential_cpu_offload: - is_sequential_cpu_offload = ( - isinstance(component._hf_hook, AlignDevicesHook) - or hasattr(component._hf_hook, "hooks") - and isinstance(component._hf_hook.hooks[0], AlignDevicesHook) - ) - - logger.info( - "Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again." 
- ) - remove_hook_from_module(component, recurse=is_sequential_cpu_offload) - - return (is_model_cpu_offload, is_sequential_cpu_offload) - @classmethod def load_lora_into_unet(cls, state_dict, network_alphas, unet, adapter_name=None, _pipeline=None): """ @@ -521,115 +379,12 @@ def load_lora_into_text_encoder( _pipeline.enable_sequential_cpu_offload() # Unsafe code /> - @classmethod - def load_lora_into_transformer(cls, state_dict, network_alphas, transformer, adapter_name=None, _pipeline=None): - """ - This will load the LoRA layers specified in `state_dict` into `transformer`. - - Parameters: - state_dict (`dict`): - A standard state dict containing the lora layer parameters. The keys can either be indexed directly - into the unet or prefixed with an additional `unet` which can be used to distinguish between text - encoder lora layers. - network_alphas (`Dict[str, float]`): - See `LoRALinearLayer` for more details. - unet (`UNet2DConditionModel`): - The UNet model to load the LoRA layers into. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - """ - from peft import LoraConfig, inject_adapter_in_model, set_peft_model_state_dict - - keys = list(state_dict.keys()) - - transformer_keys = [k for k in keys if k.startswith(cls.transformer_name)] - state_dict = { - k.replace(f"{cls.transformer_name}.", ""): v for k, v in state_dict.items() if k in transformer_keys - } - - if network_alphas is not None: - alpha_keys = [k for k in network_alphas.keys() if k.startswith(cls.transformer_name)] - network_alphas = { - k.replace(f"{cls.transformer_name}.", ""): v for k, v in network_alphas.items() if k in alpha_keys - } - - if len(state_dict.keys()) > 0: - if adapter_name in getattr(transformer, "peft_config", {}): - raise ValueError( - f"Adapter name {adapter_name} already in use in the transformer - please select a new adapter name." - ) - - rank = {} - for key, val in state_dict.items(): - if "lora_B" in key: - rank[key] = val.shape[1] - - lora_config_kwargs = get_peft_kwargs(rank, network_alphas, state_dict) - if "use_dora" in lora_config_kwargs: - if lora_config_kwargs["use_dora"] and is_peft_version("<", "0.9.0"): - raise ValueError( - "You need `peft` 0.9.0 at least to use DoRA-enabled LoRAs. Please upgrade your installation of `peft`." - ) - else: - lora_config_kwargs.pop("use_dora") - lora_config = LoraConfig(**lora_config_kwargs) - - # adapter_name - if adapter_name is None: - adapter_name = get_adapter_name(transformer) - - # In case the pipeline has been already offloaded to CPU - temporarily remove the hooks - # otherwise loading LoRA weights will lead to an error - is_model_cpu_offload, is_sequential_cpu_offload = cls._optionally_disable_offloading(_pipeline) - - inject_adapter_in_model(lora_config, transformer, adapter_name=adapter_name) - incompatible_keys = set_peft_model_state_dict(transformer, state_dict, adapter_name) - - if incompatible_keys is not None: - # check only for unexpected keys - unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None) - if unexpected_keys: - logger.warning( - f"Loading adapter weights from state_dict led to unexpected keys not found in the model: " - f" {unexpected_keys}. " - ) - - # Offload back. 
- if is_model_cpu_offload: - _pipeline.enable_model_cpu_offload() - elif is_sequential_cpu_offload: - _pipeline.enable_sequential_cpu_offload() - # Unsafe code /> - - @property - def lora_scale(self) -> float: - # property function that returns the lora scale which can be set at run time by the pipeline. - # if _lora_scale has not been set, return 1 - return self._lora_scale if hasattr(self, "_lora_scale") else 1.0 - - def _remove_text_encoder_monkey_patch(self): - remove_method = recurse_remove_peft_layers - if hasattr(self, "text_encoder"): - remove_method(self.text_encoder) - # In case text encoder have no Lora attached - if getattr(self.text_encoder, "peft_config", None) is not None: - del self.text_encoder.peft_config - self.text_encoder._hf_peft_config_loaded = None - - if hasattr(self, "text_encoder_2"): - remove_method(self.text_encoder_2) - if getattr(self.text_encoder_2, "peft_config", None) is not None: - del self.text_encoder_2.peft_config - self.text_encoder_2._hf_peft_config_loaded = None - @classmethod def save_lora_weights( cls, save_directory: Union[str, os.PathLike], unet_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None, text_encoder_lora_layers: Dict[str, torch.nn.Module] = None, - transformer_lora_layers: Dict[str, torch.nn.Module] = None, is_main_process: bool = True, weight_name: str = None, save_function: Callable = None, @@ -659,24 +414,14 @@ def save_lora_weights( """ state_dict = {} - def pack_weights(layers, prefix): - layers_weights = layers.state_dict() if isinstance(layers, torch.nn.Module) else layers - layers_state_dict = {f"{prefix}.{module_name}": param for module_name, param in layers_weights.items()} - return layers_state_dict - - if not (unet_lora_layers or text_encoder_lora_layers or transformer_lora_layers): - raise ValueError( - "You must pass at least one of `unet_lora_layers`, `text_encoder_lora_layers`, or `transformer_lora_layers`." 
- ) + if not (unet_lora_layers or text_encoder_lora_layers): + raise ValueError("You must pass at least one of `unet_lora_layers` and `text_encoder_lora_layers`.") if unet_lora_layers: - state_dict.update(pack_weights(unet_lora_layers, cls.unet_name)) + state_dict.update(cls.pack_weights(unet_lora_layers, cls.unet_name)) if text_encoder_lora_layers: - state_dict.update(pack_weights(text_encoder_lora_layers, cls.text_encoder_name)) - - if transformer_lora_layers: - state_dict.update(pack_weights(transformer_lora_layers, "transformer")) + state_dict.update(cls.pack_weights(text_encoder_lora_layers, cls.text_encoder_name)) # Save the model cls.write_lora_layers( @@ -688,68 +433,13 @@ def pack_weights(layers, prefix): safe_serialization=safe_serialization, ) - @staticmethod - def write_lora_layers( - state_dict: Dict[str, torch.Tensor], - save_directory: str, - is_main_process: bool, - weight_name: str, - save_function: Callable, - safe_serialization: bool, - ): - if os.path.isfile(save_directory): - logger.error(f"Provided path ({save_directory}) should be a directory, not a file") - return - - if save_function is None: - if safe_serialization: - - def save_function(weights, filename): - return safetensors.torch.save_file(weights, filename, metadata={"format": "pt"}) - - else: - save_function = torch.save - - os.makedirs(save_directory, exist_ok=True) - - if weight_name is None: - if safe_serialization: - weight_name = LORA_WEIGHT_NAME_SAFE - else: - weight_name = LORA_WEIGHT_NAME - - save_path = Path(save_directory, weight_name).as_posix() - save_function(state_dict, save_path) - logger.info(f"Model weights saved in {save_path}") - - def unload_lora_weights(self): - """ - Unloads the LoRA parameters. - - Examples: - - ```python - >>> # Assuming `pipeline` is already loaded with the LoRA parameters. - >>> pipeline.unload_lora_weights() - >>> ... - ``` - """ - if not USE_PEFT_BACKEND: - raise ValueError("PEFT backend is required for this method.") - - unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet - unet.unload_lora() - - # Safe to call the following regardless of LoRA. - self._remove_text_encoder_monkey_patch() - def fuse_lora( self, - fuse_unet: bool = True, - fuse_text_encoder: bool = True, + components: List[str] = ["unet", "text_encoder"], lora_scale: float = 1.0, safe_fusing: bool = False, adapter_names: Optional[List[str]] = None, + **kwargs, ): r""" Fuses the LoRA parameters into the original parameters of the corresponding blocks. @@ -761,10 +451,7 @@ def fuse_lora( Args: - fuse_unet (`bool`, defaults to `True`): Whether to fuse the UNet LoRA parameters. - fuse_text_encoder (`bool`, defaults to `True`): - Whether to fuse the text encoder LoRA parameters. If the text encoder wasn't monkey-patched with the - LoRA parameters then it won't have any effect. + components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into. lora_scale (`float`, defaults to 1.0): Controls how much to influence the outputs with the LoRA parameters. safe_fusing (`bool`, defaults to `False`): @@ -785,50 +472,14 @@ def fuse_lora( pipeline.fuse_lora(lora_scale=0.7) ``` """ - from peft.tuners.tuners_utils import BaseTunerLayer - - if fuse_unet or fuse_text_encoder: - self.num_fused_loras += 1 - if self.num_fused_loras > 1: - logger.warning( - "The current API is supported for operating with a single LoRA file. 
You are trying to load and fuse more than one LoRA which is not well-supported.", - ) + super().fuse_lora( + components=components, lora_scale=lora_scale, safe_fusing=safe_fusing, adapter_names=adapter_names + ) - if fuse_unet: - unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet - unet.fuse_lora(lora_scale, safe_fusing=safe_fusing, adapter_names=adapter_names) - - def fuse_text_encoder_lora(text_encoder, lora_scale=1.0, safe_fusing=False, adapter_names=None): - merge_kwargs = {"safe_merge": safe_fusing} - - for module in text_encoder.modules(): - if isinstance(module, BaseTunerLayer): - if lora_scale != 1.0: - module.scale_layer(lora_scale) - - # For BC with previous PEFT versions, we need to check the signature - # of the `merge` method to see if it supports the `adapter_names` argument. - supported_merge_kwargs = list(inspect.signature(module.merge).parameters) - if "adapter_names" in supported_merge_kwargs: - merge_kwargs["adapter_names"] = adapter_names - elif "adapter_names" not in supported_merge_kwargs and adapter_names is not None: - raise ValueError( - "The `adapter_names` argument is not supported with your PEFT version. " - "Please upgrade to the latest version of PEFT. `pip install -U peft`" - ) - - module.merge(**merge_kwargs) - - if fuse_text_encoder: - if hasattr(self, "text_encoder"): - fuse_text_encoder_lora(self.text_encoder, lora_scale, safe_fusing, adapter_names=adapter_names) - if hasattr(self, "text_encoder_2"): - fuse_text_encoder_lora(self.text_encoder_2, lora_scale, safe_fusing, adapter_names=adapter_names) - - def unfuse_lora(self, unfuse_unet: bool = True, unfuse_text_encoder: bool = True): + def unfuse_lora(self, components: List[str] = ["unet", "text_encoder"], **kwargs): r""" Reverses the effect of - [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraLoaderMixin.fuse_lora). + [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora). @@ -837,352 +488,26 @@ def unfuse_lora(self, unfuse_unet: bool = True, unfuse_text_encoder: bool = True Args: + components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from. unfuse_unet (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters. unfuse_text_encoder (`bool`, defaults to `True`): Whether to unfuse the text encoder LoRA parameters. If the text encoder wasn't monkey-patched with the LoRA parameters then it won't have any effect. """ - from peft.tuners.tuners_utils import BaseTunerLayer - - unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet - if unfuse_unet: - for module in unet.modules(): - if isinstance(module, BaseTunerLayer): - module.unmerge() - - def unfuse_text_encoder_lora(text_encoder): - for module in text_encoder.modules(): - if isinstance(module, BaseTunerLayer): - module.unmerge() - - if unfuse_text_encoder: - if hasattr(self, "text_encoder"): - unfuse_text_encoder_lora(self.text_encoder) - if hasattr(self, "text_encoder_2"): - unfuse_text_encoder_lora(self.text_encoder_2) - - self.num_fused_loras -= 1 - - def set_adapters_for_text_encoder( - self, - adapter_names: Union[List[str], str], - text_encoder: Optional["PreTrainedModel"] = None, # noqa: F821 - text_encoder_weights: Optional[Union[float, List[float], List[None]]] = None, - ): - """ - Sets the adapter layers for the text encoder. - - Args: - adapter_names (`List[str]` or `str`): - The names of the adapters to use. 
- text_encoder (`torch.nn.Module`, *optional*): - The text encoder module to set the adapter layers for. If `None`, it will try to get the `text_encoder` - attribute. - text_encoder_weights (`List[float]`, *optional*): - The weights to use for the text encoder. If `None`, the weights are set to `1.0` for all the adapters. - """ - if not USE_PEFT_BACKEND: - raise ValueError("PEFT backend is required for this method.") - - def process_weights(adapter_names, weights): - # Expand weights into a list, one entry per adapter - # e.g. for 2 adapters: 7 -> [7,7] ; [3, None] -> [3, None] - if not isinstance(weights, list): - weights = [weights] * len(adapter_names) - - if len(adapter_names) != len(weights): - raise ValueError( - f"Length of adapter names {len(adapter_names)} is not equal to the length of the weights {len(weights)}" - ) - - # Set None values to default of 1.0 - # e.g. [7,7] -> [7,7] ; [3, None] -> [3,1] - weights = [w if w is not None else 1.0 for w in weights] - - return weights - - adapter_names = [adapter_names] if isinstance(adapter_names, str) else adapter_names - text_encoder_weights = process_weights(adapter_names, text_encoder_weights) - text_encoder = text_encoder or getattr(self, "text_encoder", None) - if text_encoder is None: - raise ValueError( - "The pipeline does not have a default `pipe.text_encoder` class. Please make sure to pass a `text_encoder` instead." - ) - set_weights_and_activate_adapters(text_encoder, adapter_names, text_encoder_weights) - - def disable_lora_for_text_encoder(self, text_encoder: Optional["PreTrainedModel"] = None): - """ - Disables the LoRA layers for the text encoder. - - Args: - text_encoder (`torch.nn.Module`, *optional*): - The text encoder module to disable the LoRA layers for. If `None`, it will try to get the - `text_encoder` attribute. - """ - if not USE_PEFT_BACKEND: - raise ValueError("PEFT backend is required for this method.") - - text_encoder = text_encoder or getattr(self, "text_encoder", None) - if text_encoder is None: - raise ValueError("Text Encoder not found.") - set_adapter_layers(text_encoder, enabled=False) - - def enable_lora_for_text_encoder(self, text_encoder: Optional["PreTrainedModel"] = None): - """ - Enables the LoRA layers for the text encoder. - - Args: - text_encoder (`torch.nn.Module`, *optional*): - The text encoder module to enable the LoRA layers for. If `None`, it will try to get the `text_encoder` - attribute. 
- """ - if not USE_PEFT_BACKEND: - raise ValueError("PEFT backend is required for this method.") - text_encoder = text_encoder or getattr(self, "text_encoder", None) - if text_encoder is None: - raise ValueError("Text Encoder not found.") - set_adapter_layers(self.text_encoder, enabled=True) - - def set_adapters( - self, - adapter_names: Union[List[str], str], - adapter_weights: Optional[Union[float, Dict, List[float], List[Dict]]] = None, - ): - adapter_names = [adapter_names] if isinstance(adapter_names, str) else adapter_names - - adapter_weights = copy.deepcopy(adapter_weights) - - # Expand weights into a list, one entry per adapter - if not isinstance(adapter_weights, list): - adapter_weights = [adapter_weights] * len(adapter_names) - - if len(adapter_names) != len(adapter_weights): - raise ValueError( - f"Length of adapter names {len(adapter_names)} is not equal to the length of the weights {len(adapter_weights)}" - ) - - # Decompose weights into weights for unet, text_encoder and text_encoder_2 - unet_lora_weights, text_encoder_lora_weights, text_encoder_2_lora_weights = [], [], [] - - list_adapters = self.get_list_adapters() # eg {"unet": ["adapter1", "adapter2"], "text_encoder": ["adapter2"]} - all_adapters = { - adapter for adapters in list_adapters.values() for adapter in adapters - } # eg ["adapter1", "adapter2"] - invert_list_adapters = { - adapter: [part for part, adapters in list_adapters.items() if adapter in adapters] - for adapter in all_adapters - } # eg {"adapter1": ["unet"], "adapter2": ["unet", "text_encoder"]} - - for adapter_name, weights in zip(adapter_names, adapter_weights): - if isinstance(weights, dict): - unet_lora_weight = weights.pop("unet", None) - text_encoder_lora_weight = weights.pop("text_encoder", None) - text_encoder_2_lora_weight = weights.pop("text_encoder_2", None) - - if len(weights) > 0: - raise ValueError( - f"Got invalid key '{weights.keys()}' in lora weight dict for adapter {adapter_name}." - ) - - if text_encoder_2_lora_weight is not None and not hasattr(self, "text_encoder_2"): - logger.warning( - "Lora weight dict contains text_encoder_2 weights but will be ignored because pipeline does not have text_encoder_2." - ) - - # warn if adapter doesn't have parts specified by adapter_weights - for part_weight, part_name in zip( - [unet_lora_weight, text_encoder_lora_weight, text_encoder_2_lora_weight], - ["unet", "text_encoder", "text_encoder_2"], - ): - if part_weight is not None and part_name not in invert_list_adapters[adapter_name]: - logger.warning( - f"Lora weight dict for adapter '{adapter_name}' contains {part_name}, but this will be ignored because {adapter_name} does not contain weights for {part_name}. Valid parts for {adapter_name} are: {invert_list_adapters[adapter_name]}." 
- ) - - else: - unet_lora_weight = weights - text_encoder_lora_weight = weights - text_encoder_2_lora_weight = weights - - unet_lora_weights.append(unet_lora_weight) - text_encoder_lora_weights.append(text_encoder_lora_weight) - text_encoder_2_lora_weights.append(text_encoder_2_lora_weight) - - unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet - # Handle the UNET - unet.set_adapters(adapter_names, unet_lora_weights) - - # Handle the Text Encoder - if hasattr(self, "text_encoder"): - self.set_adapters_for_text_encoder(adapter_names, self.text_encoder, text_encoder_lora_weights) - if hasattr(self, "text_encoder_2"): - self.set_adapters_for_text_encoder(adapter_names, self.text_encoder_2, text_encoder_2_lora_weights) - - def disable_lora(self): - if not USE_PEFT_BACKEND: - raise ValueError("PEFT backend is required for this method.") - - # Disable unet adapters - unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet - unet.disable_lora() - - # Disable text encoder adapters - if hasattr(self, "text_encoder"): - self.disable_lora_for_text_encoder(self.text_encoder) - if hasattr(self, "text_encoder_2"): - self.disable_lora_for_text_encoder(self.text_encoder_2) - - def enable_lora(self): - if not USE_PEFT_BACKEND: - raise ValueError("PEFT backend is required for this method.") - - # Enable unet adapters - unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet - unet.enable_lora() - - # Enable text encoder adapters - if hasattr(self, "text_encoder"): - self.enable_lora_for_text_encoder(self.text_encoder) - if hasattr(self, "text_encoder_2"): - self.enable_lora_for_text_encoder(self.text_encoder_2) - - def delete_adapters(self, adapter_names: Union[List[str], str]): - """ - Args: - Deletes the LoRA layers of `adapter_name` for the unet and text-encoder(s). - adapter_names (`Union[List[str], str]`): - The names of the adapter to delete. Can be a single string or a list of strings - """ - if not USE_PEFT_BACKEND: - raise ValueError("PEFT backend is required for this method.") - - if isinstance(adapter_names, str): - adapter_names = [adapter_names] - - # Delete unet adapters - unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet - unet.delete_adapters(adapter_names) - - for adapter_name in adapter_names: - # Delete text encoder adapters - if hasattr(self, "text_encoder"): - delete_adapter_layers(self.text_encoder, adapter_name) - if hasattr(self, "text_encoder_2"): - delete_adapter_layers(self.text_encoder_2, adapter_name) - - def get_active_adapters(self) -> List[str]: - """ - Gets the list of the current active adapters. - - Example: - - ```python - from diffusers import DiffusionPipeline - - pipeline = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", - ).to("cuda") - pipeline.load_lora_weights("CiroN2022/toy-face", weight_name="toy_face_sdxl.safetensors", adapter_name="toy") - pipeline.get_active_adapters() - ``` - """ - if not USE_PEFT_BACKEND: - raise ValueError( - "PEFT backend is required for this method. 
Please install the latest version of PEFT `pip install -U peft`" - ) - - from peft.tuners.tuners_utils import BaseTunerLayer - - active_adapters = [] - unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet - for module in unet.modules(): - if isinstance(module, BaseTunerLayer): - active_adapters = module.active_adapters - break - - return active_adapters - - def get_list_adapters(self) -> Dict[str, List[str]]: - """ - Gets the current list of all available adapters in the pipeline. - """ - if not USE_PEFT_BACKEND: - raise ValueError( - "PEFT backend is required for this method. Please install the latest version of PEFT `pip install -U peft`" - ) - - set_adapters = {} - - if hasattr(self, "text_encoder") and hasattr(self.text_encoder, "peft_config"): - set_adapters["text_encoder"] = list(self.text_encoder.peft_config.keys()) - - if hasattr(self, "text_encoder_2") and hasattr(self.text_encoder_2, "peft_config"): - set_adapters["text_encoder_2"] = list(self.text_encoder_2.peft_config.keys()) + super().unfuse_lora(components=components) - unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet - if hasattr(self, self.unet_name) and hasattr(unet, "peft_config"): - set_adapters[self.unet_name] = list(self.unet.peft_config.keys()) - return set_adapters - - def set_lora_device(self, adapter_names: List[str], device: Union[torch.device, str, int]) -> None: - """ - Moves the LoRAs listed in `adapter_names` to a target device. Useful for offloading the LoRA to the CPU in case - you want to load multiple adapters and free some GPU memory. +class StableDiffusionXLLoraLoaderMixin(LoraBaseMixin): + r""" + Load LoRA layers into Stable Diffusion XL [`UNet2DConditionModel`], + [`CLIPTextModel`](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), and + [`CLIPTextModelWithProjection`](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection). + """ - Args: - adapter_names (`List[str]`): - List of adapters to send device to. - device (`Union[torch.device, str, int]`): - Device to send the adapters to. Can be either a torch device, a str or an integer. 
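> Editor's aside (not part of the diff): the adapter-management surface defined in the hunk above — `set_adapters` with scalar or per-component weights, `get_active_adapters`, `get_list_adapters`, `disable_lora`/`enable_lora`, and `set_lora_device` — is easiest to see in one place. A minimal sketch follows, assuming an SDXL pipeline; the adapter repo ids are the ones already cited in this file's docstring examples.

```py
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")

# Two adapters; repo ids reused from the docstring examples in this file.
pipe.load_lora_weights("CiroN2022/toy-face", weight_name="toy_face_sdxl.safetensors", adapter_name="toy")
pipe.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel")

# Scalar weights apply to every component an adapter touches ...
pipe.set_adapters(["toy", "pixel"], adapter_weights=[1.0, 0.7])
# ... while a dict scales unet / text_encoder / text_encoder_2 independently.
pipe.set_adapters(["toy", "pixel"], adapter_weights=[{"unet": 1.0, "text_encoder": 0.5}, 0.7])

print(pipe.get_active_adapters())  # e.g. ["toy", "pixel"]
print(pipe.get_list_adapters())    # e.g. {"unet": ["toy", "pixel"], "text_encoder": [...]}

# Toggle all LoRA layers on or off without deleting them.
pipe.disable_lora()
pipe.enable_lora()

# Keep only "toy" active and park "pixel"'s LoRA weights on the CPU to free VRAM.
pipe.set_adapters("toy")
pipe.set_lora_device(["pixel"], device="cpu")
```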
- """ - if not USE_PEFT_BACKEND: - raise ValueError("PEFT backend is required for this method.") + _lora_loadable_modules = ["unet", "text_encoder", "text_encoder_2"] + unet_name = UNET_NAME + text_encoder_name = TEXT_ENCODER_NAME - from peft.tuners.tuners_utils import BaseTunerLayer - - # Handle the UNET - unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet - for unet_module in unet.modules(): - if isinstance(unet_module, BaseTunerLayer): - for adapter_name in adapter_names: - unet_module.lora_A[adapter_name].to(device) - unet_module.lora_B[adapter_name].to(device) - # this is a param, not a module, so device placement is not in-place -> re-assign - if hasattr(unet_module, "lora_magnitude_vector") and unet_module.lora_magnitude_vector is not None: - unet_module.lora_magnitude_vector[adapter_name] = unet_module.lora_magnitude_vector[ - adapter_name - ].to(device) - - # Handle the text encoder - modules_to_process = [] - if hasattr(self, "text_encoder"): - modules_to_process.append(self.text_encoder) - - if hasattr(self, "text_encoder_2"): - modules_to_process.append(self.text_encoder_2) - - for text_encoder in modules_to_process: - # loop over submodules - for text_encoder_module in text_encoder.modules(): - if isinstance(text_encoder_module, BaseTunerLayer): - for adapter_name in adapter_names: - text_encoder_module.lora_A[adapter_name].to(device) - text_encoder_module.lora_B[adapter_name].to(device) - # this is a param, not a module, so device placement is not in-place -> re-assign - if ( - hasattr(text_encoder_module, "lora_magnitude_vector") - and text_encoder_module.lora_magnitude_vector is not None - ): - text_encoder_module.lora_magnitude_vector[ - adapter_name - ] = text_encoder_module.lora_magnitude_vector[adapter_name].to(device) - - -class StableDiffusionXLLoraLoaderMixin(LoraLoaderMixin): - """This class overrides `LoraLoaderMixin` with LoRA loading/saving code that's specific to SDXL""" - - # Override to properly handle the loading and unloading of the additional text encoder. def load_lora_weights( self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], @@ -1195,22 +520,23 @@ def load_lora_weights( All kwargs are forwarded to `self.lora_state_dict`. - See [`~loaders.LoraLoaderMixin.lora_state_dict`] for more details on how the state dict is loaded. + See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details on how the state dict is + loaded. - See [`~loaders.LoraLoaderMixin.load_lora_into_unet`] for more details on how the state dict is loaded into - `self.unet`. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_unet`] for more details on how the state dict is + loaded into `self.unet`. - See [`~loaders.LoraLoaderMixin.load_lora_into_text_encoder`] for more details on how the state dict is loaded - into `self.text_encoder`. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_text_encoder`] for more details on how the state + dict is loaded into `self.text_encoder`. Parameters: pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - See [`~loaders.LoraLoaderMixin.lora_state_dict`]. + See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. adapter_name (`str`, *optional*): Adapter name to be used for referencing the loaded adapter model. If not specified, it will use `default_{i}` where i is the total number of adapters being loaded. kwargs (`dict`, *optional*): - See [`~loaders.LoraLoaderMixin.lora_state_dict`]. 
+ See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. """ if not USE_PEFT_BACKEND: raise ValueError("PEFT backend is required for this method.") @@ -1261,15 +587,777 @@ def load_lora_weights( ) @classmethod - def save_lora_weights( + @validate_hf_hub_args + # Copied from diffusers.loaders.lora_pipeline.StableDiffusionLoraLoaderMixin.lora_state_dict + def lora_state_dict( cls, - save_directory: Union[str, os.PathLike], - unet_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None, - text_encoder_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None, - text_encoder_2_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None, - is_main_process: bool = True, - weight_name: str = None, - save_function: Callable = None, + pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], + **kwargs, + ): + r""" + Return state dict for lora weights and the network alphas. + + + + We support loading A1111 formatted LoRA checkpoints in a limited capacity. + + This function is experimental and might change in the future. + + + + Parameters: + pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): + Can be either: + + - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on + the Hub. + - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved + with [`ModelMixin.save_pretrained`]. + - A [torch state + dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). + + cache_dir (`Union[str, os.PathLike]`, *optional*): + Path to a directory where a downloaded pretrained model configuration is cached if the standard cache + is not used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + local_files_only (`bool`, *optional*, defaults to `False`): + Whether to only load local model weights and configuration files or not. If set to `True`, the model + won't be downloaded from the Hub. + token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from + `diffusers-cli login` (stored in `~/.huggingface`) is used. + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier + allowed by Git. + subfolder (`str`, *optional*, defaults to `""`): + The subfolder location of a model file within a larger model repository on the Hub or locally. + weight_name (`str`, *optional*, defaults to None): + Name of the serialized state dict file. + """ + # Load the main state dict first which has the LoRA layers for either of + # UNet and text encoder or both. 
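> Editor's aside (a sketch, not part of the diff): since the SDXL mixin's `lora_state_dict` documented above is a classmethod returning both the LoRA tensors and any network alphas, it can be used to inspect a checkpoint before loading it. The repo id is reused from the `fuse_lora` docstring example further down; `network_alphas` is only populated for non-diffusers (e.g. Kohya-style) files.

```py
from diffusers import StableDiffusionXLPipeline

# Inspect a LoRA file without instantiating a pipeline.
state_dict, network_alphas = StableDiffusionXLPipeline.lora_state_dict(
    "nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors"
)

print(len(state_dict))                                # number of LoRA tensors
print(sorted({k.split(".")[0] for k in state_dict}))  # top-level prefixes, e.g. "unet", "text_encoder"
print(network_alphas)                                 # None for diffusers-format checkpoints
```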
+ cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + proxies = kwargs.pop("proxies", None) + local_files_only = kwargs.pop("local_files_only", None) + token = kwargs.pop("token", None) + revision = kwargs.pop("revision", None) + subfolder = kwargs.pop("subfolder", None) + weight_name = kwargs.pop("weight_name", None) + unet_config = kwargs.pop("unet_config", None) + use_safetensors = kwargs.pop("use_safetensors", None) + + allow_pickle = False + if use_safetensors is None: + use_safetensors = True + allow_pickle = True + + user_agent = { + "file_type": "attn_procs_weights", + "framework": "pytorch", + } + + state_dict = cls._fetch_state_dict( + pretrained_model_name_or_path_or_dict=pretrained_model_name_or_path_or_dict, + weight_name=weight_name, + use_safetensors=use_safetensors, + local_files_only=local_files_only, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + token=token, + revision=revision, + subfolder=subfolder, + user_agent=user_agent, + allow_pickle=allow_pickle, + ) + + network_alphas = None + # TODO: replace it with a method from `state_dict_utils` + if all( + ( + k.startswith("lora_te_") + or k.startswith("lora_unet_") + or k.startswith("lora_te1_") + or k.startswith("lora_te2_") + ) + for k in state_dict.keys() + ): + # Map SDXL blocks correctly. + if unet_config is not None: + # use unet config to remap block numbers + state_dict = _maybe_map_sgm_blocks_to_diffusers(state_dict, unet_config) + state_dict, network_alphas = _convert_non_diffusers_lora_to_diffusers(state_dict) + + return state_dict, network_alphas + + @classmethod + # Copied from diffusers.loaders.lora_pipeline.StableDiffusionLoraLoaderMixin.load_lora_into_unet + def load_lora_into_unet(cls, state_dict, network_alphas, unet, adapter_name=None, _pipeline=None): + """ + This will load the LoRA layers specified in `state_dict` into `unet`. + + Parameters: + state_dict (`dict`): + A standard state dict containing the lora layer parameters. The keys can either be indexed directly + into the unet or prefixed with an additional `unet` which can be used to distinguish between text + encoder lora layers. + network_alphas (`Dict[str, float]`): + The value of the network alpha used for stable learning and preventing underflow. This value has the + same meaning as the `--network_alpha` option in the kohya-ss trainer script. Refer to [this + link](https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning). + unet (`UNet2DConditionModel`): + The UNet model to load the LoRA layers into. + adapter_name (`str`, *optional*): + Adapter name to be used for referencing the loaded adapter model. If not specified, it will use + `default_{i}` where i is the total number of adapters being loaded. + """ + if not USE_PEFT_BACKEND: + raise ValueError("PEFT backend is required for this method.") + + # If the serialization format is new (introduced in https://github.com/huggingface/diffusers/pull/2918), + # then the `state_dict` keys should have `cls.unet_name` and/or `cls.text_encoder_name` as + # their prefixes. + keys = list(state_dict.keys()) + only_text_encoder = all(key.startswith(cls.text_encoder_name) for key in keys) + if not only_text_encoder: + # Load the layers corresponding to UNet. 
+ logger.info(f"Loading {cls.unet_name}.") + unet.load_attn_procs( + state_dict, network_alphas=network_alphas, adapter_name=adapter_name, _pipeline=_pipeline + ) + + @classmethod + # Copied from diffusers.loaders.lora_pipeline.StableDiffusionLoraLoaderMixin.load_lora_into_text_encoder + def load_lora_into_text_encoder( + cls, + state_dict, + network_alphas, + text_encoder, + prefix=None, + lora_scale=1.0, + adapter_name=None, + _pipeline=None, + ): + """ + This will load the LoRA layers specified in `state_dict` into `text_encoder` + + Parameters: + state_dict (`dict`): + A standard state dict containing the lora layer parameters. The key should be prefixed with an + additional `text_encoder` to distinguish between unet lora layers. + network_alphas (`Dict[str, float]`): + See `LoRALinearLayer` for more details. + text_encoder (`CLIPTextModel`): + The text encoder model to load the LoRA layers into. + prefix (`str`): + Expected prefix of the `text_encoder` in the `state_dict`. + lora_scale (`float`): + How much to scale the output of the lora linear layer before it is added with the output of the regular + lora layer. + adapter_name (`str`, *optional*): + Adapter name to be used for referencing the loaded adapter model. If not specified, it will use + `default_{i}` where i is the total number of adapters being loaded. + """ + if not USE_PEFT_BACKEND: + raise ValueError("PEFT backend is required for this method.") + + from peft import LoraConfig + + # If the serialization format is new (introduced in https://github.com/huggingface/diffusers/pull/2918), + # then the `state_dict` keys should have `self.unet_name` and/or `self.text_encoder_name` as + # their prefixes. + keys = list(state_dict.keys()) + prefix = cls.text_encoder_name if prefix is None else prefix + + # Safe prefix to check with. + if any(cls.text_encoder_name in key for key in keys): + # Load the layers corresponding to text encoder and make necessary adjustments. 
+ text_encoder_keys = [k for k in keys if k.startswith(prefix) and k.split(".")[0] == prefix] + text_encoder_lora_state_dict = { + k.replace(f"{prefix}.", ""): v for k, v in state_dict.items() if k in text_encoder_keys + } + + if len(text_encoder_lora_state_dict) > 0: + logger.info(f"Loading {prefix}.") + rank = {} + text_encoder_lora_state_dict = convert_state_dict_to_diffusers(text_encoder_lora_state_dict) + + # convert state dict + text_encoder_lora_state_dict = convert_state_dict_to_peft(text_encoder_lora_state_dict) + + for name, _ in text_encoder_attn_modules(text_encoder): + for module in ("out_proj", "q_proj", "k_proj", "v_proj"): + rank_key = f"{name}.{module}.lora_B.weight" + if rank_key not in text_encoder_lora_state_dict: + continue + rank[rank_key] = text_encoder_lora_state_dict[rank_key].shape[1] + + for name, _ in text_encoder_mlp_modules(text_encoder): + for module in ("fc1", "fc2"): + rank_key = f"{name}.{module}.lora_B.weight" + if rank_key not in text_encoder_lora_state_dict: + continue + rank[rank_key] = text_encoder_lora_state_dict[rank_key].shape[1] + + if network_alphas is not None: + alpha_keys = [ + k for k in network_alphas.keys() if k.startswith(prefix) and k.split(".")[0] == prefix + ] + network_alphas = { + k.replace(f"{prefix}.", ""): v for k, v in network_alphas.items() if k in alpha_keys + } + + lora_config_kwargs = get_peft_kwargs(rank, network_alphas, text_encoder_lora_state_dict, is_unet=False) + if "use_dora" in lora_config_kwargs: + if lora_config_kwargs["use_dora"]: + if is_peft_version("<", "0.9.0"): + raise ValueError( + "You need `peft` 0.9.0 at least to use DoRA-enabled LoRAs. Please upgrade your installation of `peft`." + ) + else: + if is_peft_version("<", "0.9.0"): + lora_config_kwargs.pop("use_dora") + lora_config = LoraConfig(**lora_config_kwargs) + + # adapter_name + if adapter_name is None: + adapter_name = get_adapter_name(text_encoder) + + is_model_cpu_offload, is_sequential_cpu_offload = cls._optionally_disable_offloading(_pipeline) + + # inject LoRA layers and load the state dict + # in transformers we automatically check whether the adapter name is already in use or not + text_encoder.load_adapter( + adapter_name=adapter_name, + adapter_state_dict=text_encoder_lora_state_dict, + peft_config=lora_config, + ) + + # scale LoRA layers with `lora_scale` + scale_lora_layers(text_encoder, weight=lora_scale) + + text_encoder.to(device=text_encoder.device, dtype=text_encoder.dtype) + + # Offload back. + if is_model_cpu_offload: + _pipeline.enable_model_cpu_offload() + elif is_sequential_cpu_offload: + _pipeline.enable_sequential_cpu_offload() + # Unsafe code /> + + @classmethod + def save_lora_weights( + cls, + save_directory: Union[str, os.PathLike], + unet_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None, + text_encoder_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None, + text_encoder_2_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None, + is_main_process: bool = True, + weight_name: str = None, + save_function: Callable = None, + safe_serialization: bool = True, + ): + r""" + Save the LoRA parameters corresponding to the UNet and text encoder. + + Arguments: + save_directory (`str` or `os.PathLike`): + Directory to save LoRA parameters to. Will be created if it doesn't exist. + unet_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): + State dict of the LoRA layers corresponding to the `unet`. 
+ text_encoder_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): + State dict of the LoRA layers corresponding to the `text_encoder`. Must explicitly pass the text + encoder LoRA state dict because it comes from 🤗 Transformers. + text_encoder_2_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): + State dict of the LoRA layers corresponding to the `text_encoder_2`. Must explicitly pass the text + encoder LoRA state dict because it comes from 🤗 Transformers. + is_main_process (`bool`, *optional*, defaults to `True`): + Whether the process calling this is the main process or not. Useful during distributed training and you + need to call this function on all processes. In this case, set `is_main_process=True` only on the main + process to avoid race conditions. + save_function (`Callable`): + The function to use to save the state dictionary. Useful during distributed training when you need to + replace `torch.save` with another method. Can be configured with the environment variable + `DIFFUSERS_SAVE_MODE`. + safe_serialization (`bool`, *optional*, defaults to `True`): + Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`. + """ + state_dict = {} + + if not (unet_lora_layers or text_encoder_lora_layers or text_encoder_2_lora_layers): + raise ValueError( + "You must pass at least one of `unet_lora_layers`, `text_encoder_lora_layers` or `text_encoder_2_lora_layers`." + ) + + if unet_lora_layers: + state_dict.update(cls.pack_weights(unet_lora_layers, "unet")) + + if text_encoder_lora_layers: + state_dict.update(cls.pack_weights(text_encoder_lora_layers, "text_encoder")) + + if text_encoder_2_lora_layers: + state_dict.update(cls.pack_weights(text_encoder_2_lora_layers, "text_encoder_2")) + + cls.write_lora_layers( + state_dict=state_dict, + save_directory=save_directory, + is_main_process=is_main_process, + weight_name=weight_name, + save_function=save_function, + safe_serialization=safe_serialization, + ) + + def fuse_lora( + self, + components: List[str] = ["unet", "text_encoder", "text_encoder_2"], + lora_scale: float = 1.0, + safe_fusing: bool = False, + adapter_names: Optional[List[str]] = None, + **kwargs, + ): + r""" + Fuses the LoRA parameters into the original parameters of the corresponding blocks. + + + + This is an experimental API. + + + + Args: + components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into. + lora_scale (`float`, defaults to 1.0): + Controls how much to influence the outputs with the LoRA parameters. + safe_fusing (`bool`, defaults to `False`): + Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them. + adapter_names (`List[str]`, *optional*): + Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused. 
+ + Example: + + ```py + from diffusers import DiffusionPipeline + import torch + + pipeline = DiffusionPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 + ).to("cuda") + pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel") + pipeline.fuse_lora(lora_scale=0.7) + ``` + """ + super().fuse_lora( + components=components, lora_scale=lora_scale, safe_fusing=safe_fusing, adapter_names=adapter_names + ) + + def unfuse_lora(self, components: List[str] = ["unet", "text_encoder", "text_encoder_2"], **kwargs): + r""" + Reverses the effect of + [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora). + + + + This is an experimental API. + + + + Args: + components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from. + unfuse_unet (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters. + unfuse_text_encoder (`bool`, defaults to `True`): + Whether to unfuse the text encoder LoRA parameters. If the text encoder wasn't monkey-patched with the + LoRA parameters then it won't have any effect. + """ + super().unfuse_lora(components=components) + + +class SD3LoraLoaderMixin(LoraBaseMixin): + r""" + Load LoRA layers into [`SD3Transformer2DModel`], + [`CLIPTextModel`](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), and + [`CLIPTextModelWithProjection`](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection). + + Specific to [`StableDiffusion3Pipeline`]. + """ + + _lora_loadable_modules = ["transformer", "text_encoder", "text_encoder_2"] + transformer_name = TRANSFORMER_NAME + text_encoder_name = TEXT_ENCODER_NAME + + @classmethod + @validate_hf_hub_args + def lora_state_dict( + cls, + pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], + **kwargs, + ): + r""" + Return state dict for lora weights and the network alphas. + + + + We support loading A1111 formatted LoRA checkpoints in a limited capacity. + + This function is experimental and might change in the future. + + + + Parameters: + pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): + Can be either: + + - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on + the Hub. + - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved + with [`ModelMixin.save_pretrained`]. + - A [torch state + dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). + + cache_dir (`Union[str, os.PathLike]`, *optional*): + Path to a directory where a downloaded pretrained model configuration is cached if the standard cache + is not used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + local_files_only (`bool`, *optional*, defaults to `False`): + Whether to only load local model weights and configuration files or not. If set to `True`, the model + won't be downloaded from the Hub. 
+ token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from + `diffusers-cli login` (stored in `~/.huggingface`) is used. + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier + allowed by Git. + subfolder (`str`, *optional*, defaults to `""`): + The subfolder location of a model file within a larger model repository on the Hub or locally. + + """ + # Load the main state dict first which has the LoRA layers for either of + # transformer and text encoder or both. + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + proxies = kwargs.pop("proxies", None) + local_files_only = kwargs.pop("local_files_only", None) + token = kwargs.pop("token", None) + revision = kwargs.pop("revision", None) + subfolder = kwargs.pop("subfolder", None) + weight_name = kwargs.pop("weight_name", None) + use_safetensors = kwargs.pop("use_safetensors", None) + + allow_pickle = False + if use_safetensors is None: + use_safetensors = True + allow_pickle = True + + user_agent = { + "file_type": "attn_procs_weights", + "framework": "pytorch", + } + + state_dict = cls._fetch_state_dict( + pretrained_model_name_or_path_or_dict=pretrained_model_name_or_path_or_dict, + weight_name=weight_name, + use_safetensors=use_safetensors, + local_files_only=local_files_only, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + token=token, + revision=revision, + subfolder=subfolder, + user_agent=user_agent, + allow_pickle=allow_pickle, + ) + + return state_dict + + def load_lora_weights( + self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], adapter_name=None, **kwargs + ): + """ + Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.unet` and + `self.text_encoder`. + + All kwargs are forwarded to `self.lora_state_dict`. + + See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details on how the state dict is + loaded. + + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_transformer`] for more details on how the state + dict is loaded into `self.transformer`. + + Parameters: + pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): + See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. + kwargs (`dict`, *optional*): + See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. + adapter_name (`str`, *optional*): + Adapter name to be used for referencing the loaded adapter model. If not specified, it will use + `default_{i}` where i is the total number of adapters being loaded. + """ + if not USE_PEFT_BACKEND: + raise ValueError("PEFT backend is required for this method.") + + # if a dict is passed, copy it instead of modifying it inplace + if isinstance(pretrained_model_name_or_path_or_dict, dict): + pretrained_model_name_or_path_or_dict = pretrained_model_name_or_path_or_dict.copy() + + # First, ensure that the checkpoint is a compatible one and can be successfully loaded. 
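> Editor's aside (a usage sketch of the SD3 entry point documented above, not part of the diff): the repo id and weight name are reused from the SD3 `fuse_lora` example near the end of this file.

```py
import torch
from diffusers import StableDiffusion3Pipeline

pipe = StableDiffusion3Pipeline.from_pretrained(
    "stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16
).to("cuda")

# Loads the transformer (and, if present, text-encoder) LoRA layers in one call.
pipe.load_lora_weights(
    "nerijs/pixel-art-medium-128-v0.1",
    weight_name="pixel-art-medium-128-v0.1.safetensors",
    adapter_name="pixel",
)
image = pipe("pixel art of a corgi").images[0]
```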
+ state_dict = self.lora_state_dict(pretrained_model_name_or_path_or_dict, **kwargs) + + is_correct_format = all("lora" in key or "dora_scale" in key for key in state_dict.keys()) + if not is_correct_format: + raise ValueError("Invalid LoRA checkpoint.") + + self.load_lora_into_transformer( + state_dict, + transformer=getattr(self, self.transformer_name) if not hasattr(self, "transformer") else self.transformer, + adapter_name=adapter_name, + _pipeline=self, + ) + + text_encoder_state_dict = {k: v for k, v in state_dict.items() if "text_encoder." in k} + if len(text_encoder_state_dict) > 0: + self.load_lora_into_text_encoder( + text_encoder_state_dict, + network_alphas=None, + text_encoder=self.text_encoder, + prefix="text_encoder", + lora_scale=self.lora_scale, + adapter_name=adapter_name, + _pipeline=self, + ) + + text_encoder_2_state_dict = {k: v for k, v in state_dict.items() if "text_encoder_2." in k} + if len(text_encoder_2_state_dict) > 0: + self.load_lora_into_text_encoder( + text_encoder_2_state_dict, + network_alphas=None, + text_encoder=self.text_encoder_2, + prefix="text_encoder_2", + lora_scale=self.lora_scale, + adapter_name=adapter_name, + _pipeline=self, + ) + + @classmethod + def load_lora_into_transformer(cls, state_dict, transformer, adapter_name=None, _pipeline=None): + """ + This will load the LoRA layers specified in `state_dict` into `transformer`. + + Parameters: + state_dict (`dict`): + A standard state dict containing the lora layer parameters. The keys can either be indexed directly + into the unet or prefixed with an additional `unet` which can be used to distinguish between text + encoder lora layers. + transformer (`SD3Transformer2DModel`): + The Transformer model to load the LoRA layers into. + adapter_name (`str`, *optional*): + Adapter name to be used for referencing the loaded adapter model. If not specified, it will use + `default_{i}` where i is the total number of adapters being loaded. + """ + from peft import LoraConfig, inject_adapter_in_model, set_peft_model_state_dict + + keys = list(state_dict.keys()) + + transformer_keys = [k for k in keys if k.startswith(cls.transformer_name)] + state_dict = { + k.replace(f"{cls.transformer_name}.", ""): v for k, v in state_dict.items() if k in transformer_keys + } + + if len(state_dict.keys()) > 0: + # check with first key if is not in peft format + first_key = next(iter(state_dict.keys())) + if "lora_A" not in first_key: + state_dict = convert_unet_state_dict_to_peft(state_dict) + + if adapter_name in getattr(transformer, "peft_config", {}): + raise ValueError( + f"Adapter name {adapter_name} already in use in the transformer - please select a new adapter name." + ) + + rank = {} + for key, val in state_dict.items(): + if "lora_B" in key: + rank[key] = val.shape[1] + + lora_config_kwargs = get_peft_kwargs(rank, network_alpha_dict=None, peft_state_dict=state_dict) + if "use_dora" in lora_config_kwargs: + if lora_config_kwargs["use_dora"] and is_peft_version("<", "0.9.0"): + raise ValueError( + "You need `peft` 0.9.0 at least to use DoRA-enabled LoRAs. Please upgrade your installation of `peft`." 
+ ) + else: + lora_config_kwargs.pop("use_dora") + lora_config = LoraConfig(**lora_config_kwargs) + + # adapter_name + if adapter_name is None: + adapter_name = get_adapter_name(transformer) + + # In case the pipeline has been already offloaded to CPU - temporarily remove the hooks + # otherwise loading LoRA weights will lead to an error + is_model_cpu_offload, is_sequential_cpu_offload = cls._optionally_disable_offloading(_pipeline) + + inject_adapter_in_model(lora_config, transformer, adapter_name=adapter_name) + incompatible_keys = set_peft_model_state_dict(transformer, state_dict, adapter_name) + + if incompatible_keys is not None: + # check only for unexpected keys + unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None) + if unexpected_keys: + logger.warning( + f"Loading adapter weights from state_dict led to unexpected keys not found in the model: " + f" {unexpected_keys}. " + ) + + # Offload back. + if is_model_cpu_offload: + _pipeline.enable_model_cpu_offload() + elif is_sequential_cpu_offload: + _pipeline.enable_sequential_cpu_offload() + # Unsafe code /> + + @classmethod + # Copied from diffusers.loaders.lora_pipeline.StableDiffusionLoraLoaderMixin.load_lora_into_text_encoder + def load_lora_into_text_encoder( + cls, + state_dict, + network_alphas, + text_encoder, + prefix=None, + lora_scale=1.0, + adapter_name=None, + _pipeline=None, + ): + """ + This will load the LoRA layers specified in `state_dict` into `text_encoder` + + Parameters: + state_dict (`dict`): + A standard state dict containing the lora layer parameters. The key should be prefixed with an + additional `text_encoder` to distinguish between unet lora layers. + network_alphas (`Dict[str, float]`): + See `LoRALinearLayer` for more details. + text_encoder (`CLIPTextModel`): + The text encoder model to load the LoRA layers into. + prefix (`str`): + Expected prefix of the `text_encoder` in the `state_dict`. + lora_scale (`float`): + How much to scale the output of the lora linear layer before it is added with the output of the regular + lora layer. + adapter_name (`str`, *optional*): + Adapter name to be used for referencing the loaded adapter model. If not specified, it will use + `default_{i}` where i is the total number of adapters being loaded. + """ + if not USE_PEFT_BACKEND: + raise ValueError("PEFT backend is required for this method.") + + from peft import LoraConfig + + # If the serialization format is new (introduced in https://github.com/huggingface/diffusers/pull/2918), + # then the `state_dict` keys should have `self.unet_name` and/or `self.text_encoder_name` as + # their prefixes. + keys = list(state_dict.keys()) + prefix = cls.text_encoder_name if prefix is None else prefix + + # Safe prefix to check with. + if any(cls.text_encoder_name in key for key in keys): + # Load the layers corresponding to text encoder and make necessary adjustments. 
+ text_encoder_keys = [k for k in keys if k.startswith(prefix) and k.split(".")[0] == prefix] + text_encoder_lora_state_dict = { + k.replace(f"{prefix}.", ""): v for k, v in state_dict.items() if k in text_encoder_keys + } + + if len(text_encoder_lora_state_dict) > 0: + logger.info(f"Loading {prefix}.") + rank = {} + text_encoder_lora_state_dict = convert_state_dict_to_diffusers(text_encoder_lora_state_dict) + + # convert state dict + text_encoder_lora_state_dict = convert_state_dict_to_peft(text_encoder_lora_state_dict) + + for name, _ in text_encoder_attn_modules(text_encoder): + for module in ("out_proj", "q_proj", "k_proj", "v_proj"): + rank_key = f"{name}.{module}.lora_B.weight" + if rank_key not in text_encoder_lora_state_dict: + continue + rank[rank_key] = text_encoder_lora_state_dict[rank_key].shape[1] + + for name, _ in text_encoder_mlp_modules(text_encoder): + for module in ("fc1", "fc2"): + rank_key = f"{name}.{module}.lora_B.weight" + if rank_key not in text_encoder_lora_state_dict: + continue + rank[rank_key] = text_encoder_lora_state_dict[rank_key].shape[1] + + if network_alphas is not None: + alpha_keys = [ + k for k in network_alphas.keys() if k.startswith(prefix) and k.split(".")[0] == prefix + ] + network_alphas = { + k.replace(f"{prefix}.", ""): v for k, v in network_alphas.items() if k in alpha_keys + } + + lora_config_kwargs = get_peft_kwargs(rank, network_alphas, text_encoder_lora_state_dict, is_unet=False) + if "use_dora" in lora_config_kwargs: + if lora_config_kwargs["use_dora"]: + if is_peft_version("<", "0.9.0"): + raise ValueError( + "You need `peft` 0.9.0 at least to use DoRA-enabled LoRAs. Please upgrade your installation of `peft`." + ) + else: + if is_peft_version("<", "0.9.0"): + lora_config_kwargs.pop("use_dora") + lora_config = LoraConfig(**lora_config_kwargs) + + # adapter_name + if adapter_name is None: + adapter_name = get_adapter_name(text_encoder) + + is_model_cpu_offload, is_sequential_cpu_offload = cls._optionally_disable_offloading(_pipeline) + + # inject LoRA layers and load the state dict + # in transformers we automatically check whether the adapter name is already in use or not + text_encoder.load_adapter( + adapter_name=adapter_name, + adapter_state_dict=text_encoder_lora_state_dict, + peft_config=lora_config, + ) + + # scale LoRA layers with `lora_scale` + scale_lora_layers(text_encoder, weight=lora_scale) + + text_encoder.to(device=text_encoder.device, dtype=text_encoder.dtype) + + # Offload back. + if is_model_cpu_offload: + _pipeline.enable_model_cpu_offload() + elif is_sequential_cpu_offload: + _pipeline.enable_sequential_cpu_offload() + # Unsafe code /> + + @classmethod + def save_lora_weights( + cls, + save_directory: Union[str, os.PathLike], + transformer_lora_layers: Dict[str, torch.nn.Module] = None, + text_encoder_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None, + text_encoder_2_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None, + is_main_process: bool = True, + weight_name: str = None, + save_function: Callable = None, safe_serialization: bool = True, ): r""" @@ -1278,8 +1366,8 @@ def save_lora_weights( Arguments: save_directory (`str` or `os.PathLike`): Directory to save LoRA parameters to. Will be created if it doesn't exist. - unet_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): - State dict of the LoRA layers corresponding to the `unet`. 
+ transformer_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): + State dict of the LoRA layers corresponding to the `transformer`. text_encoder_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): State dict of the LoRA layers corresponding to the `text_encoder`. Must explicitly pass the text encoder LoRA state dict because it comes from 🤗 Transformers. @@ -1299,25 +1387,21 @@ def save_lora_weights( """ state_dict = {} - def pack_weights(layers, prefix): - layers_weights = layers.state_dict() if isinstance(layers, torch.nn.Module) else layers - layers_state_dict = {f"{prefix}.{module_name}": param for module_name, param in layers_weights.items()} - return layers_state_dict - - if not (unet_lora_layers or text_encoder_lora_layers or text_encoder_2_lora_layers): + if not (transformer_lora_layers or text_encoder_lora_layers or text_encoder_2_lora_layers): raise ValueError( - "You must pass at least one of `unet_lora_layers`, `text_encoder_lora_layers` or `text_encoder_2_lora_layers`." + "You must pass at least one of `transformer_lora_layers`, `text_encoder_lora_layers`, `text_encoder_2_lora_layers`." ) - if unet_lora_layers: - state_dict.update(pack_weights(unet_lora_layers, "unet")) + if transformer_lora_layers: + state_dict.update(cls.pack_weights(transformer_lora_layers, cls.transformer_name)) if text_encoder_lora_layers: - state_dict.update(pack_weights(text_encoder_lora_layers, "text_encoder")) + state_dict.update(cls.pack_weights(text_encoder_lora_layers, "text_encoder")) if text_encoder_2_lora_layers: - state_dict.update(pack_weights(text_encoder_2_lora_layers, "text_encoder_2")) + state_dict.update(cls.pack_weights(text_encoder_2_lora_layers, "text_encoder_2")) + # Save the model cls.write_lora_layers( state_dict=state_dict, save_directory=save_directory, @@ -1327,198 +1411,79 @@ def pack_weights(layers, prefix): safe_serialization=safe_serialization, ) - def _remove_text_encoder_monkey_patch(self): - recurse_remove_peft_layers(self.text_encoder) - # TODO: @younesbelkada handle this in transformers side - if getattr(self.text_encoder, "peft_config", None) is not None: - del self.text_encoder.peft_config - self.text_encoder._hf_peft_config_loaded = None - - recurse_remove_peft_layers(self.text_encoder_2) - if getattr(self.text_encoder_2, "peft_config", None) is not None: - del self.text_encoder_2.peft_config - self.text_encoder_2._hf_peft_config_loaded = None - - -class SD3LoraLoaderMixin: - r""" - Load LoRA layers into [`SD3Transformer2DModel`]. - """ - - transformer_name = TRANSFORMER_NAME - num_fused_loras = 0 - - def load_lora_weights( - self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], adapter_name=None, **kwargs - ): - """ - Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.unet` and - `self.text_encoder`. - - All kwargs are forwarded to `self.lora_state_dict`. - - See [`~loaders.LoraLoaderMixin.lora_state_dict`] for more details on how the state dict is loaded. - - See [`~loaders.LoraLoaderMixin.load_lora_into_transformer`] for more details on how the state dict is loaded - into `self.transformer`. - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - See [`~loaders.LoraLoaderMixin.lora_state_dict`]. - kwargs (`dict`, *optional*): - See [`~loaders.LoraLoaderMixin.lora_state_dict`]. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. 
If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - """ - if not USE_PEFT_BACKEND: - raise ValueError("PEFT backend is required for this method.") - - # if a dict is passed, copy it instead of modifying it inplace - if isinstance(pretrained_model_name_or_path_or_dict, dict): - pretrained_model_name_or_path_or_dict = pretrained_model_name_or_path_or_dict.copy() - - # First, ensure that the checkpoint is a compatible one and can be successfully loaded. - state_dict = self.lora_state_dict(pretrained_model_name_or_path_or_dict, **kwargs) - - is_correct_format = all("lora" in key or "dora_scale" in key for key in state_dict.keys()) - if not is_correct_format: - raise ValueError("Invalid LoRA checkpoint.") - - self.load_lora_into_transformer( - state_dict, - transformer=getattr(self, self.transformer_name) if not hasattr(self, "transformer") else self.transformer, - adapter_name=adapter_name, - _pipeline=self, - ) - - @classmethod - @validate_hf_hub_args - def lora_state_dict( - cls, - pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], + def fuse_lora( + self, + components: List[str] = ["transformer", "text_encoder", "text_encoder_2"], + lora_scale: float = 1.0, + safe_fusing: bool = False, + adapter_names: Optional[List[str]] = None, **kwargs, ): r""" - Return state dict for lora weights and the network alphas. + Fuses the LoRA parameters into the original parameters of the corresponding blocks. - We support loading A1111 formatted LoRA checkpoints in a limited capacity. - - This function is experimental and might change in the future. + This is an experimental API. - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - Can be either: + Args: + components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into. + lora_scale (`float`, defaults to 1.0): + Controls how much to influence the outputs with the LoRA parameters. + safe_fusing (`bool`, defaults to `False`): + Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them. + adapter_names (`List[str]`, *optional*): + Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused. - - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on - the Hub. - - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved - with [`ModelMixin.save_pretrained`]. - - A [torch state - dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). + Example: - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory where a downloaded pretrained model configuration is cached if the standard cache - is not used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to resume downloading the model weights and configuration files. If set to `False`, any - incompletely downloaded files are deleted. - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. 
- local_files_only (`bool`, *optional*, defaults to `False`): - Whether to only load local model weights and configuration files or not. If set to `True`, the model - won't be downloaded from the Hub. - token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from - `diffusers-cli login` (stored in `~/.huggingface`) is used. - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier - allowed by Git. - subfolder (`str`, *optional*, defaults to `""`): - The subfolder location of a model file within a larger model repository on the Hub or locally. + ```py + from diffusers import DiffusionPipeline + import torch + pipeline = DiffusionPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 + ).to("cuda") + pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel") + pipeline.fuse_lora(lora_scale=0.7) + ``` """ - # Load the main state dict first which has the LoRA layers for either of - # UNet and text encoder or both. - cache_dir = kwargs.pop("cache_dir", None) - force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", False) - proxies = kwargs.pop("proxies", None) - local_files_only = kwargs.pop("local_files_only", None) - token = kwargs.pop("token", None) - revision = kwargs.pop("revision", None) - subfolder = kwargs.pop("subfolder", None) - weight_name = kwargs.pop("weight_name", None) - use_safetensors = kwargs.pop("use_safetensors", None) + super().fuse_lora( + components=components, lora_scale=lora_scale, safe_fusing=safe_fusing, adapter_names=adapter_names + ) + + def unfuse_lora(self, components: List[str] = ["transformer", "text_encoder", "text_encoder_2"], **kwargs): + r""" + Reverses the effect of + [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora). - allow_pickle = False - if use_safetensors is None: - use_safetensors = True - allow_pickle = True + - user_agent = { - "file_type": "attn_procs_weights", - "framework": "pytorch", - } + This is an experimental API. 
- model_file = None - if not isinstance(pretrained_model_name_or_path_or_dict, dict): - # Let's first try to load .safetensors weights - if (use_safetensors and weight_name is None) or ( - weight_name is not None and weight_name.endswith(".safetensors") - ): - try: - model_file = _get_model_file( - pretrained_model_name_or_path_or_dict, - weights_name=weight_name or LORA_WEIGHT_NAME_SAFE, - cache_dir=cache_dir, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - local_files_only=local_files_only, - token=token, - revision=revision, - subfolder=subfolder, - user_agent=user_agent, - ) - state_dict = safetensors.torch.load_file(model_file, device="cpu") - except (IOError, safetensors.SafetensorError) as e: - if not allow_pickle: - raise e - # try loading non-safetensors weights - model_file = None - pass - - if model_file is None: - model_file = _get_model_file( - pretrained_model_name_or_path_or_dict, - weights_name=weight_name or LORA_WEIGHT_NAME, - cache_dir=cache_dir, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - local_files_only=local_files_only, - token=token, - revision=revision, - subfolder=subfolder, - user_agent=user_agent, - ) - state_dict = load_state_dict(model_file) - else: - state_dict = pretrained_model_name_or_path_or_dict + - return state_dict + Args: + components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from. + unfuse_unet (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters. + unfuse_text_encoder (`bool`, defaults to `True`): + Whether to unfuse the text encoder LoRA parameters. If the text encoder wasn't monkey-patched with the + LoRA parameters then it won't have any effect. + """ + super().unfuse_lora(components=components) + + +# The reason why we subclass from `StableDiffusionLoraLoaderMixin` here is because Amused initially +# relied on `StableDiffusionLoraLoaderMixin` for its LoRA support. +class AmusedLoraLoaderMixin(StableDiffusionLoraLoaderMixin): + _lora_loadable_modules = ["transformer", "text_encoder"] + transformer_name = TRANSFORMER_NAME + text_encoder_name = TEXT_ENCODER_NAME @classmethod - def load_lora_into_transformer(cls, state_dict, transformer, adapter_name=None, _pipeline=None): + def load_lora_into_transformer(cls, state_dict, network_alphas, transformer, adapter_name=None, _pipeline=None): """ This will load the LoRA layers specified in `state_dict` into `transformer`. @@ -1527,12 +1492,17 @@ def load_lora_into_transformer(cls, state_dict, transformer, adapter_name=None, A standard state dict containing the lora layer parameters. The keys can either be indexed directly into the unet or prefixed with an additional `unet` which can be used to distinguish between text encoder lora layers. - transformer (`SD3Transformer2DModel`): - The Transformer model to load the LoRA layers into. + network_alphas (`Dict[str, float]`): + See `LoRALinearLayer` for more details. + unet (`UNet2DConditionModel`): + The UNet model to load the LoRA layers into. adapter_name (`str`, *optional*): Adapter name to be used for referencing the loaded adapter model. If not specified, it will use `default_{i}` where i is the total number of adapters being loaded. 
""" + if not USE_PEFT_BACKEND: + raise ValueError("PEFT backend is required for this method.") + from peft import LoraConfig, inject_adapter_in_model, set_peft_model_state_dict keys = list(state_dict.keys()) @@ -1542,12 +1512,13 @@ def load_lora_into_transformer(cls, state_dict, transformer, adapter_name=None, k.replace(f"{cls.transformer_name}.", ""): v for k, v in state_dict.items() if k in transformer_keys } - if len(state_dict.keys()) > 0: - # check with first key if is not in peft format - first_key = next(iter(state_dict.keys())) - if "lora_A" not in first_key: - state_dict = convert_unet_state_dict_to_peft(state_dict) + if network_alphas is not None: + alpha_keys = [k for k in network_alphas.keys() if k.startswith(cls.transformer_name)] + network_alphas = { + k.replace(f"{cls.transformer_name}.", ""): v for k, v in network_alphas.items() if k in alpha_keys + } + if len(state_dict.keys()) > 0: if adapter_name in getattr(transformer, "peft_config", {}): raise ValueError( f"Adapter name {adapter_name} already in use in the transformer - please select a new adapter name." @@ -1558,7 +1529,7 @@ def load_lora_into_transformer(cls, state_dict, transformer, adapter_name=None, if "lora_B" in key: rank[key] = val.shape[1] - lora_config_kwargs = get_peft_kwargs(rank, network_alpha_dict=None, peft_state_dict=state_dict) + lora_config_kwargs = get_peft_kwargs(rank, network_alphas, state_dict) if "use_dora" in lora_config_kwargs: if lora_config_kwargs["use_dora"] and is_peft_version("<", "0.9.0"): raise ValueError( @@ -1595,13 +1566,131 @@ def load_lora_into_transformer(cls, state_dict, transformer, adapter_name=None, _pipeline.enable_sequential_cpu_offload() # Unsafe code /> + @classmethod + # Copied from diffusers.loaders.lora_pipeline.StableDiffusionLoraLoaderMixin.load_lora_into_text_encoder + def load_lora_into_text_encoder( + cls, + state_dict, + network_alphas, + text_encoder, + prefix=None, + lora_scale=1.0, + adapter_name=None, + _pipeline=None, + ): + """ + This will load the LoRA layers specified in `state_dict` into `text_encoder` + + Parameters: + state_dict (`dict`): + A standard state dict containing the lora layer parameters. The key should be prefixed with an + additional `text_encoder` to distinguish between unet lora layers. + network_alphas (`Dict[str, float]`): + See `LoRALinearLayer` for more details. + text_encoder (`CLIPTextModel`): + The text encoder model to load the LoRA layers into. + prefix (`str`): + Expected prefix of the `text_encoder` in the `state_dict`. + lora_scale (`float`): + How much to scale the output of the lora linear layer before it is added with the output of the regular + lora layer. + adapter_name (`str`, *optional*): + Adapter name to be used for referencing the loaded adapter model. If not specified, it will use + `default_{i}` where i is the total number of adapters being loaded. + """ + if not USE_PEFT_BACKEND: + raise ValueError("PEFT backend is required for this method.") + + from peft import LoraConfig + + # If the serialization format is new (introduced in https://github.com/huggingface/diffusers/pull/2918), + # then the `state_dict` keys should have `self.unet_name` and/or `self.text_encoder_name` as + # their prefixes. + keys = list(state_dict.keys()) + prefix = cls.text_encoder_name if prefix is None else prefix + + # Safe prefix to check with. + if any(cls.text_encoder_name in key for key in keys): + # Load the layers corresponding to text encoder and make necessary adjustments. 
+ text_encoder_keys = [k for k in keys if k.startswith(prefix) and k.split(".")[0] == prefix] + text_encoder_lora_state_dict = { + k.replace(f"{prefix}.", ""): v for k, v in state_dict.items() if k in text_encoder_keys + } + + if len(text_encoder_lora_state_dict) > 0: + logger.info(f"Loading {prefix}.") + rank = {} + text_encoder_lora_state_dict = convert_state_dict_to_diffusers(text_encoder_lora_state_dict) + + # convert state dict + text_encoder_lora_state_dict = convert_state_dict_to_peft(text_encoder_lora_state_dict) + + for name, _ in text_encoder_attn_modules(text_encoder): + for module in ("out_proj", "q_proj", "k_proj", "v_proj"): + rank_key = f"{name}.{module}.lora_B.weight" + if rank_key not in text_encoder_lora_state_dict: + continue + rank[rank_key] = text_encoder_lora_state_dict[rank_key].shape[1] + + for name, _ in text_encoder_mlp_modules(text_encoder): + for module in ("fc1", "fc2"): + rank_key = f"{name}.{module}.lora_B.weight" + if rank_key not in text_encoder_lora_state_dict: + continue + rank[rank_key] = text_encoder_lora_state_dict[rank_key].shape[1] + + if network_alphas is not None: + alpha_keys = [ + k for k in network_alphas.keys() if k.startswith(prefix) and k.split(".")[0] == prefix + ] + network_alphas = { + k.replace(f"{prefix}.", ""): v for k, v in network_alphas.items() if k in alpha_keys + } + + lora_config_kwargs = get_peft_kwargs(rank, network_alphas, text_encoder_lora_state_dict, is_unet=False) + if "use_dora" in lora_config_kwargs: + if lora_config_kwargs["use_dora"]: + if is_peft_version("<", "0.9.0"): + raise ValueError( + "You need `peft` 0.9.0 at least to use DoRA-enabled LoRAs. Please upgrade your installation of `peft`." + ) + else: + if is_peft_version("<", "0.9.0"): + lora_config_kwargs.pop("use_dora") + lora_config = LoraConfig(**lora_config_kwargs) + + # adapter_name + if adapter_name is None: + adapter_name = get_adapter_name(text_encoder) + + is_model_cpu_offload, is_sequential_cpu_offload = cls._optionally_disable_offloading(_pipeline) + + # inject LoRA layers and load the state dict + # in transformers we automatically check whether the adapter name is already in use or not + text_encoder.load_adapter( + adapter_name=adapter_name, + adapter_state_dict=text_encoder_lora_state_dict, + peft_config=lora_config, + ) + + # scale LoRA layers with `lora_scale` + scale_lora_layers(text_encoder, weight=lora_scale) + + text_encoder.to(device=text_encoder.device, dtype=text_encoder.dtype) + + # Offload back. + if is_model_cpu_offload: + _pipeline.enable_model_cpu_offload() + elif is_sequential_cpu_offload: + _pipeline.enable_sequential_cpu_offload() + # Unsafe code /> + @classmethod def save_lora_weights( cls, save_directory: Union[str, os.PathLike], + text_encoder_lora_layers: Dict[str, torch.nn.Module] = None, transformer_lora_layers: Dict[str, torch.nn.Module] = None, - text_encoder_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None, - text_encoder_2_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None, is_main_process: bool = True, weight_name: str = None, save_function: Callable = None, @@ -1613,8 +1702,11 @@ def save_lora_weights( Arguments: save_directory (`str` or `os.PathLike`): Directory to save LoRA parameters to. Will be created if it doesn't exist. - transformer_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): - State dict of the LoRA layers corresponding to the `transformer`. 
+ unet_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): + State dict of the LoRA layers corresponding to the `unet`. + text_encoder_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): + State dict of the LoRA layers corresponding to the `text_encoder`. Must explicitly pass the text + encoder LoRA state dict because it comes from 🤗 Transformers. is_main_process (`bool`, *optional*, defaults to `True`): Whether the process calling this is the main process or not. Useful during distributed training and you need to call this function on all processes. In this case, set `is_main_process=True` only on the main @@ -1628,24 +1720,14 @@ def save_lora_weights( """ state_dict = {} - def pack_weights(layers, prefix): - layers_weights = layers.state_dict() if isinstance(layers, torch.nn.Module) else layers - layers_state_dict = {f"{prefix}.{module_name}": param for module_name, param in layers_weights.items()} - return layers_state_dict - - if not (transformer_lora_layers or text_encoder_lora_layers or text_encoder_2_lora_layers): - raise ValueError( - "You must pass at least one of `transformer_lora_layers`, `text_encoder_lora_layers`, `text_encoder_2_lora_layers`." - ) + if not (transformer_lora_layers or text_encoder_lora_layers): + raise ValueError("You must pass at least one of `transformer_lora_layers` or `text_encoder_lora_layers`.") if transformer_lora_layers: - state_dict.update(pack_weights(transformer_lora_layers, cls.transformer_name)) + state_dict.update(cls.pack_weights(transformer_lora_layers, cls.transformer_name)) if text_encoder_lora_layers: - state_dict.update(pack_weights(text_encoder_lora_layers, "text_encoder")) - - if text_encoder_2_lora_layers: - state_dict.update(pack_weights(text_encoder_2_lora_layers, "text_encoder_2")) + state_dict.update(cls.pack_weights(text_encoder_lora_layers, cls.text_encoder_name)) # Save the model cls.write_lora_layers( @@ -1657,164 +1739,9 @@ def pack_weights(layers, prefix): safe_serialization=safe_serialization, ) - @staticmethod - def write_lora_layers( - state_dict: Dict[str, torch.Tensor], - save_directory: str, - is_main_process: bool, - weight_name: str, - save_function: Callable, - safe_serialization: bool, - ): - if os.path.isfile(save_directory): - logger.error(f"Provided path ({save_directory}) should be a directory, not a file") - return - - if save_function is None: - if safe_serialization: - - def save_function(weights, filename): - return safetensors.torch.save_file(weights, filename, metadata={"format": "pt"}) - - else: - save_function = torch.save - - os.makedirs(save_directory, exist_ok=True) - - if weight_name is None: - if safe_serialization: - weight_name = LORA_WEIGHT_NAME_SAFE - else: - weight_name = LORA_WEIGHT_NAME - - save_path = Path(save_directory, weight_name).as_posix() - save_function(state_dict, save_path) - logger.info(f"Model weights saved in {save_path}") - - def unload_lora_weights(self): - """ - Unloads the LoRA parameters. - - Examples: - - ```python - >>> # Assuming `pipeline` is already loaded with the LoRA parameters. - >>> pipeline.unload_lora_weights() - >>> ... 
- ``` - """ - transformer = getattr(self, self.transformer_name) if not hasattr(self, "transformer") else self.transformer - recurse_remove_peft_layers(transformer) - if hasattr(transformer, "peft_config"): - del transformer.peft_config - - @classmethod - # Copied from diffusers.loaders.lora.LoraLoaderMixin._optionally_disable_offloading - def _optionally_disable_offloading(cls, _pipeline): - """ - Optionally removes offloading in case the pipeline has been already sequentially offloaded to CPU. - - Args: - _pipeline (`DiffusionPipeline`): - The pipeline to disable offloading for. - - Returns: - tuple: - A tuple indicating if `is_model_cpu_offload` or `is_sequential_cpu_offload` is True. - """ - is_model_cpu_offload = False - is_sequential_cpu_offload = False - - if _pipeline is not None and _pipeline.hf_device_map is None: - for _, component in _pipeline.components.items(): - if isinstance(component, nn.Module) and hasattr(component, "_hf_hook"): - if not is_model_cpu_offload: - is_model_cpu_offload = isinstance(component._hf_hook, CpuOffload) - if not is_sequential_cpu_offload: - is_sequential_cpu_offload = ( - isinstance(component._hf_hook, AlignDevicesHook) - or hasattr(component._hf_hook, "hooks") - and isinstance(component._hf_hook.hooks[0], AlignDevicesHook) - ) - - logger.info( - "Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again." - ) - remove_hook_from_module(component, recurse=is_sequential_cpu_offload) - - return (is_model_cpu_offload, is_sequential_cpu_offload) - - def fuse_lora( - self, - fuse_transformer: bool = True, - lora_scale: float = 1.0, - safe_fusing: bool = False, - adapter_names: Optional[List[str]] = None, - ): - r""" - Fuses the LoRA parameters into the original parameters of the corresponding blocks. - - - - This is an experimental API. - - - - Args: - fuse_transformer (`bool`, defaults to `True`): Whether to fuse the transformer LoRA parameters. - lora_scale (`float`, defaults to 1.0): - Controls how much to influence the outputs with the LoRA parameters. - safe_fusing (`bool`, defaults to `False`): - Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them. - adapter_names (`List[str]`, *optional*): - Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused. - - Example: - - ```py - from diffusers import DiffusionPipeline - import torch - - pipeline = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16 - ).to("cuda") - pipeline.load_lora_weights( - "nerijs/pixel-art-medium-128-v0.1", - weight_name="pixel-art-medium-128-v0.1.safetensors", - adapter_name="pixel", - ) - pipeline.fuse_lora(lora_scale=0.7) - ``` - """ - if fuse_transformer: - self.num_fused_loras += 1 - - if fuse_transformer: - transformer = ( - getattr(self, self.transformer_name) if not hasattr(self, "transformer") else self.transformer - ) - transformer.fuse_lora(lora_scale, safe_fusing=safe_fusing, adapter_names=adapter_names) - - def unfuse_lora(self, unfuse_transformer: bool = True): - r""" - Reverses the effect of - [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraLoaderMixin.fuse_lora). - - - - This is an experimental API. - - - - Args: - unfuse_transformer (`bool`, defaults to `True`): Whether to unfuse the transformer LoRA parameters. 
- """ - from peft.tuners.tuners_utils import BaseTunerLayer - - transformer = getattr(self, self.transformer_name) if not hasattr(self, "transformer") else self.transformer - if unfuse_transformer: - for module in transformer.modules(): - if isinstance(module, BaseTunerLayer): - module.unmerge() - self.num_fused_loras -= 1 +class LoraLoaderMixin(StableDiffusionLoraLoaderMixin): + def __init__(self, *args, **kwargs): + deprecation_message = "LoraLoaderMixin is deprecated and this will be removed in a future version. Please use `StableDiffusionLoraLoaderMixin`, instead." + deprecate("LoraLoaderMixin", "1.0.0", deprecation_message) + super().__init__(*args, **kwargs) diff --git a/src/diffusers/loaders/peft.py b/src/diffusers/loaders/peft.py index 5892c2865374..5625f9755b19 100644 --- a/src/diffusers/loaders/peft.py +++ b/src/diffusers/loaders/peft.py @@ -12,15 +12,32 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Union +import inspect +from functools import partial +from typing import Dict, List, Optional, Union -from ..utils import MIN_PEFT_VERSION, check_peft_version, is_peft_available +from ..utils import ( + MIN_PEFT_VERSION, + USE_PEFT_BACKEND, + check_peft_version, + delete_adapter_layers, + is_peft_available, + set_adapter_layers, + set_weights_and_activate_adapters, +) +from .unet_loader_utils import _maybe_expand_lora_scales + + +_SET_ADAPTER_SCALE_FN_MAPPING = { + "UNet2DConditionModel": _maybe_expand_lora_scales, + "SD3Transformer2DModel": lambda model_cls, weights: weights, +} class PeftAdapterMixin: """ A class containing all functions for loading and using adapters weights that are supported in PEFT library. For - more details about adapters and injecting them in a transformer-based model, check out the PEFT + more details about adapters and injecting them in a base model, check out the PEFT [documentation](https://huggingface.co/docs/peft/index). Install the latest version of PEFT, and use this mixin to: @@ -33,6 +50,62 @@ class PeftAdapterMixin: _hf_peft_config_loaded = False + def set_adapters( + self, + adapter_names: Union[List[str], str], + weights: Optional[Union[float, Dict, List[float], List[Dict], List[None]]] = None, + ): + """ + Set the currently active adapters for use in the UNet. + + Args: + adapter_names (`List[str]` or `str`): + The names of the adapters to use. + adapter_weights (`Union[List[float], float]`, *optional*): + The adapter(s) weights to use with the UNet. If `None`, the weights are set to `1.0` for all the + adapters. + + Example: + + ```py + from diffusers import AutoPipelineForText2Image + import torch + + pipeline = AutoPipelineForText2Image.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 + ).to("cuda") + pipeline.load_lora_weights( + "jbilcke-hf/sdxl-cinematic-1", weight_name="pytorch_lora_weights.safetensors", adapter_name="cinematic" + ) + pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel") + pipeline.set_adapters(["cinematic", "pixel"], adapter_weights=[0.5, 0.5]) + ``` + """ + if not USE_PEFT_BACKEND: + raise ValueError("PEFT backend is required for `set_adapters()`.") + + adapter_names = [adapter_names] if isinstance(adapter_names, str) else adapter_names + + # Expand weights into a list, one entry per adapter + # examples for e.g. 
2 adapters: [{...}, 7] -> [7,7] ; None -> [None, None] + if not isinstance(weights, list): + weights = [weights] * len(adapter_names) + + if len(adapter_names) != len(weights): + raise ValueError( + f"Length of adapter names {len(adapter_names)} is not equal to the length of their weights {len(weights)}." + ) + + # Set None values to default of 1.0 + # e.g. [{...}, 7] -> [{...}, 7] ; [None, None] -> [1.0, 1.0] + weights = [w if w is not None else 1.0 for w in weights] + + # e.g. [{...}, 7] -> [{expanded dict...}, 7] + scale_expansion_fn = _SET_ADAPTER_SCALE_FN_MAPPING[self.__class__.__name__] + weights = scale_expansion_fn(self, weights) + + set_weights_and_activate_adapters(self, adapter_names, weights) + def add_adapter(self, adapter_config, adapter_name: str = "default") -> None: r""" Adds a new adapter to the current model for training. If no adapter name is passed, a default name is assigned @@ -66,7 +139,7 @@ def add_adapter(self, adapter_config, adapter_name: str = "default") -> None: ) # Unlike transformers, here we don't need to retrieve the name_or_path of the unet as the loading logic is - # handled by the `load_lora_layers` or `LoraLoaderMixin`. Therefore we set it to `None` here. + # handled by the `load_lora_layers` or `StableDiffusionLoraLoaderMixin`. Therefore we set it to `None` here. adapter_config.base_model_name_or_path = None inject_adapter_in_model(adapter_config, self, adapter_name) self.set_adapter(adapter_name) @@ -185,3 +258,136 @@ def active_adapters(self) -> List[str]: for _, module in self.named_modules(): if isinstance(module, BaseTunerLayer): return module.active_adapter + + def fuse_lora(self, lora_scale=1.0, safe_fusing=False, adapter_names=None): + if not USE_PEFT_BACKEND: + raise ValueError("PEFT backend is required for `fuse_lora()`.") + + self.lora_scale = lora_scale + self._safe_fusing = safe_fusing + self.apply(partial(self._fuse_lora_apply, adapter_names=adapter_names)) + + def _fuse_lora_apply(self, module, adapter_names=None): + from peft.tuners.tuners_utils import BaseTunerLayer + + merge_kwargs = {"safe_merge": self._safe_fusing} + + if isinstance(module, BaseTunerLayer): + if self.lora_scale != 1.0: + module.scale_layer(self.lora_scale) + + # For BC with prevous PEFT versions, we need to check the signature + # of the `merge` method to see if it supports the `adapter_names` argument. + supported_merge_kwargs = list(inspect.signature(module.merge).parameters) + if "adapter_names" in supported_merge_kwargs: + merge_kwargs["adapter_names"] = adapter_names + elif "adapter_names" not in supported_merge_kwargs and adapter_names is not None: + raise ValueError( + "The `adapter_names` argument is not supported with your PEFT version. Please upgrade" + " to the latest version of PEFT. `pip install -U peft`" + ) + + module.merge(**merge_kwargs) + + def unfuse_lora(self): + if not USE_PEFT_BACKEND: + raise ValueError("PEFT backend is required for `unfuse_lora()`.") + self.apply(self._unfuse_lora_apply) + + def _unfuse_lora_apply(self, module): + from peft.tuners.tuners_utils import BaseTunerLayer + + if isinstance(module, BaseTunerLayer): + module.unmerge() + + def unload_lora(self): + if not USE_PEFT_BACKEND: + raise ValueError("PEFT backend is required for `unload_lora()`.") + + from ..utils import recurse_remove_peft_layers + + recurse_remove_peft_layers(self) + if hasattr(self, "peft_config"): + del self.peft_config + + def disable_lora(self): + """ + Disables the active LoRA layers of the underlying model. 
+ + Example: + + ```py + from diffusers import AutoPipelineForText2Image + import torch + + pipeline = AutoPipelineForText2Image.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 + ).to("cuda") + pipeline.load_lora_weights( + "jbilcke-hf/sdxl-cinematic-1", weight_name="pytorch_lora_weights.safetensors", adapter_name="cinematic" + ) + pipeline.disable_lora() + ``` + """ + if not USE_PEFT_BACKEND: + raise ValueError("PEFT backend is required for this method.") + set_adapter_layers(self, enabled=False) + + def enable_lora(self): + """ + Enables the active LoRA layers of the underlying model. + + Example: + + ```py + from diffusers import AutoPipelineForText2Image + import torch + + pipeline = AutoPipelineForText2Image.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 + ).to("cuda") + pipeline.load_lora_weights( + "jbilcke-hf/sdxl-cinematic-1", weight_name="pytorch_lora_weights.safetensors", adapter_name="cinematic" + ) + pipeline.enable_lora() + ``` + """ + if not USE_PEFT_BACKEND: + raise ValueError("PEFT backend is required for this method.") + set_adapter_layers(self, enabled=True) + + def delete_adapters(self, adapter_names: Union[List[str], str]): + """ + Delete an adapter's LoRA layers from the underlying model. + + Args: + adapter_names (`Union[List[str], str]`): + The names (single string or list of strings) of the adapter to delete. + + Example: + + ```py + from diffusers import AutoPipelineForText2Image + import torch + + pipeline = AutoPipelineForText2Image.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 + ).to("cuda") + pipeline.load_lora_weights( + "jbilcke-hf/sdxl-cinematic-1", weight_name="pytorch_lora_weights.safetensors", adapter_names="cinematic" + ) + pipeline.delete_adapters("cinematic") + ``` + """ + if not USE_PEFT_BACKEND: + raise ValueError("PEFT backend is required for this method.") + + if isinstance(adapter_names, str): + adapter_names = [adapter_names] + + for adapter_name in adapter_names: + delete_adapter_layers(self, adapter_name) + + # Pop also the corresponding adapter from the config + if hasattr(self, "peft_config"): + self.peft_config.pop(adapter_name, None) diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py index f6e6373ce035..f8ef9a8a74ab 100644 --- a/src/diffusers/loaders/single_file.py +++ b/src/diffusers/loaders/single_file.py @@ -242,7 +242,6 @@ def _download_diffusers_model_config_from_hub( revision, proxies, force_download=None, - resume_download=None, local_files_only=None, token=None, ): @@ -253,7 +252,6 @@ def _download_diffusers_model_config_from_hub( revision=revision, proxies=proxies, force_download=force_download, - resume_download=resume_download, local_files_only=local_files_only, token=token, allow_patterns=allow_patterns, @@ -288,9 +286,7 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs): cache_dir (`Union[str, os.PathLike]`, *optional*): Path to a directory where a downloaded pretrained model configuration is cached if the standard cache is not used. - resume_download: - Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 - of Diffusers. + proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. 
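The adapter-management methods added to `PeftAdapterMixin` above (`set_adapters`, `fuse_lora`, `unfuse_lora`, `unload_lora`, `disable_lora`, `enable_lora`, `delete_adapters`) become callable directly on models such as `UNet2DConditionModel`. A minimal usage sketch follows; the base checkpoint and LoRA repositories are the same placeholders used in the docstring examples above, and a CUDA device plus an installed PEFT backend are assumed.

```py
# Minimal sketch: assumes the PEFT backend is installed and a CUDA device is available.
import torch
from diffusers import AutoPipelineForText2Image

pipeline = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")
pipeline.load_lora_weights(
    "jbilcke-hf/sdxl-cinematic-1", weight_name="pytorch_lora_weights.safetensors", adapter_name="cinematic"
)
pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel")

# Model-level adapter management via PeftAdapterMixin
pipeline.unet.set_adapters(["cinematic", "pixel"], weights=[0.5, 0.5])  # activate both adapters at half strength
pipeline.unet.fuse_lora()               # merge the active adapters into the base weights
pipeline.unet.unfuse_lora()             # undo the merge
pipeline.unet.disable_lora()            # switch LoRA layers off without removing them
pipeline.unet.enable_lora()             # switch them back on
pipeline.unet.delete_adapters("pixel")  # remove one adapter and its peft_config entry
```

Note that the model-level `set_adapters` shown here takes `weights=` (the signature added above), while the pipeline-level helper in the docstring example uses `adapter_weights=`.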
@@ -352,7 +348,6 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs): deprecate("original_config_file", "1.0.0", deprecation_message) original_config = original_config_file - resume_download = kwargs.pop("resume_download", None) force_download = kwargs.pop("force_download", False) proxies = kwargs.pop("proxies", None) token = kwargs.pop("token", None) @@ -382,7 +377,6 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs): checkpoint = load_single_file_checkpoint( pretrained_model_link_or_path, - resume_download=resume_download, force_download=force_download, proxies=proxies, token=token, @@ -412,7 +406,6 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs): revision=revision, proxies=proxies, force_download=force_download, - resume_download=resume_download, local_files_only=local_files_only, token=token, ) @@ -435,7 +428,6 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs): revision=revision, proxies=proxies, force_download=force_download, - resume_download=resume_download, local_files_only=False, token=token, ) diff --git a/src/diffusers/loaders/single_file_model.py b/src/diffusers/loaders/single_file_model.py index 3a8a44e81506..92438620abd8 100644 --- a/src/diffusers/loaders/single_file_model.py +++ b/src/diffusers/loaders/single_file_model.py @@ -137,9 +137,7 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = cache_dir (`Union[str, os.PathLike]`, *optional*): Path to a directory where a downloaded pretrained model configuration is cached if the standard cache is not used. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to resume downloading the model weights and configuration files. If set to `False`, any - incompletely downloaded files are deleted. + proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. @@ -188,7 +186,6 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = "`from_single_file` cannot accept both `config` and `original_config` arguments. 
Please provide only one of these arguments" ) - resume_download = kwargs.pop("resume_download", None) force_download = kwargs.pop("force_download", False) proxies = kwargs.pop("proxies", None) token = kwargs.pop("token", None) @@ -203,7 +200,6 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = else: checkpoint = load_single_file_checkpoint( pretrained_model_link_or_path_or_dict, - resume_download=resume_download, force_download=force_download, proxies=proxies, token=token, diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py index 726bf4733ac5..483125f24825 100644 --- a/src/diffusers/loaders/single_file_utils.py +++ b/src/diffusers/loaders/single_file_utils.py @@ -313,7 +313,6 @@ def _is_model_weights_in_cached_folder(cached_folder, name): def load_single_file_checkpoint( pretrained_model_link_or_path, - resume_download=False, force_download=False, proxies=None, token=None, @@ -331,7 +330,6 @@ def load_single_file_checkpoint( weights_name=weights_name, force_download=force_download, cache_dir=cache_dir, - resume_download=resume_download, proxies=proxies, local_files_only=local_files_only, token=token, diff --git a/src/diffusers/loaders/textual_inversion.py b/src/diffusers/loaders/textual_inversion.py index b6e1545e16dd..574b89233cc1 100644 --- a/src/diffusers/loaders/textual_inversion.py +++ b/src/diffusers/loaders/textual_inversion.py @@ -38,7 +38,6 @@ def load_textual_inversion_state_dicts(pretrained_model_name_or_paths, **kwargs): cache_dir = kwargs.pop("cache_dir", None) force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", None) proxies = kwargs.pop("proxies", None) local_files_only = kwargs.pop("local_files_only", None) token = kwargs.pop("token", None) @@ -72,7 +71,6 @@ def load_textual_inversion_state_dicts(pretrained_model_name_or_paths, **kwargs) weights_name=weight_name or TEXT_INVERSION_NAME_SAFE, cache_dir=cache_dir, force_download=force_download, - resume_download=resume_download, proxies=proxies, local_files_only=local_files_only, token=token, @@ -93,7 +91,6 @@ def load_textual_inversion_state_dicts(pretrained_model_name_or_paths, **kwargs) weights_name=weight_name or TEXT_INVERSION_NAME, cache_dir=cache_dir, force_download=force_download, - resume_download=resume_download, proxies=proxies, local_files_only=local_files_only, token=token, @@ -308,9 +305,7 @@ def load_textual_inversion( force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. - resume_download: - Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 - of Diffusers. + proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. diff --git a/src/diffusers/loaders/unet.py b/src/diffusers/loaders/unet.py index 58c9c0e60d17..32ace77b6224 100644 --- a/src/diffusers/loaders/unet.py +++ b/src/diffusers/loaders/unet.py @@ -11,13 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
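The single-file and textual-inversion loaders changed above only drop the deprecated `resume_download` argument; interrupted downloads now resume automatically. A rough sketch of the call pattern after this change, with the checkpoint path and embedding repository as placeholders and a CUDA device assumed:

```py
# Minimal sketch: the local checkpoint path is a placeholder and a CUDA device is assumed.
import torch
from diffusers import StableDiffusionPipeline

# Single-file loading: no `resume_download` argument; other hub kwargs
# (`force_download`, `proxies`, `token`, ...) are still accepted.
pipe = StableDiffusionPipeline.from_single_file(
    "path/to/v1-5-checkpoint.safetensors", torch_dtype=torch.float16
).to("cuda")

# Textual inversion loading follows the same pattern.
pipe.load_textual_inversion("sd-concepts-library/cat-toy")
image = pipe("a <cat-toy> figurine on a wooden desk").images[0]
```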
-import inspect import os from collections import defaultdict from contextlib import nullcontext -from functools import partial from pathlib import Path -from typing import Callable, Dict, List, Optional, Union +from typing import Callable, Dict, Union import safetensors import torch @@ -38,18 +36,14 @@ USE_PEFT_BACKEND, _get_model_file, convert_unet_state_dict_to_peft, - delete_adapter_layers, get_adapter_name, get_peft_kwargs, is_accelerate_available, is_peft_version, is_torch_version, logging, - set_adapter_layers, - set_weights_and_activate_adapters, ) -from .lora import LORA_WEIGHT_NAME, LORA_WEIGHT_NAME_SAFE, TEXT_ENCODER_NAME, UNET_NAME -from .unet_loader_utils import _maybe_expand_lora_scales +from .lora_pipeline import LORA_WEIGHT_NAME, LORA_WEIGHT_NAME_SAFE, TEXT_ENCODER_NAME, UNET_NAME from .utils import AttnProcsLayers @@ -97,9 +91,7 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. - resume_download: - Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 - of Diffusers. + proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. @@ -140,7 +132,6 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict """ cache_dir = kwargs.pop("cache_dir", None) force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", None) proxies = kwargs.pop("proxies", None) local_files_only = kwargs.pop("local_files_only", None) token = kwargs.pop("token", None) @@ -174,7 +165,6 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict weights_name=weight_name or LORA_WEIGHT_NAME_SAFE, cache_dir=cache_dir, force_download=force_download, - resume_download=resume_download, proxies=proxies, local_files_only=local_files_only, token=token, @@ -194,7 +184,6 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict weights_name=weight_name or LORA_WEIGHT_NAME, cache_dir=cache_dir, force_download=force_download, - resume_download=resume_download, proxies=proxies, local_files_only=local_files_only, token=token, @@ -362,7 +351,7 @@ def _process_lora(self, state_dict, unet_identifier_key, network_alphas, adapter return is_model_cpu_offload, is_sequential_cpu_offload @classmethod - # Copied from diffusers.loaders.lora.LoraLoaderMixin._optionally_disable_offloading + # Copied from diffusers.loaders.lora_base.LoraBaseMixin._optionally_disable_offloading def _optionally_disable_offloading(cls, _pipeline): """ Optionally removes offloading in case the pipeline has been already sequentially offloaded to CPU. 
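For the `load_attn_procs` path touched above, the call pattern is likewise unchanged apart from the removed `resume_download` kwarg. A small sketch, assuming a hypothetical LoRA checkpoint saved in the diffusers format; the model id and paths are placeholders:

```py
# Minimal sketch: model id and LoRA location are placeholders; a CUDA device is assumed.
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# Attach LoRA attention weights directly to the UNet; `resume_download` is gone,
# everything else (`weight_name`, `subfolder`, `token`, ...) is unchanged.
pipe.unet.load_attn_procs("path/to/lora", weight_name="pytorch_lora_weights.safetensors")
image = pipe("masterpiece, best quality, a mountain lake at sunset").images[0]
```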
@@ -524,194 +513,6 @@ def _get_custom_diffusion_state_dict(self): return state_dict - def fuse_lora(self, lora_scale=1.0, safe_fusing=False, adapter_names=None): - if not USE_PEFT_BACKEND: - raise ValueError("PEFT backend is required for `fuse_lora()`.") - - self.lora_scale = lora_scale - self._safe_fusing = safe_fusing - self.apply(partial(self._fuse_lora_apply, adapter_names=adapter_names)) - - def _fuse_lora_apply(self, module, adapter_names=None): - from peft.tuners.tuners_utils import BaseTunerLayer - - merge_kwargs = {"safe_merge": self._safe_fusing} - - if isinstance(module, BaseTunerLayer): - if self.lora_scale != 1.0: - module.scale_layer(self.lora_scale) - - # For BC with prevous PEFT versions, we need to check the signature - # of the `merge` method to see if it supports the `adapter_names` argument. - supported_merge_kwargs = list(inspect.signature(module.merge).parameters) - if "adapter_names" in supported_merge_kwargs: - merge_kwargs["adapter_names"] = adapter_names - elif "adapter_names" not in supported_merge_kwargs and adapter_names is not None: - raise ValueError( - "The `adapter_names` argument is not supported with your PEFT version. Please upgrade" - " to the latest version of PEFT. `pip install -U peft`" - ) - - module.merge(**merge_kwargs) - - def unfuse_lora(self): - if not USE_PEFT_BACKEND: - raise ValueError("PEFT backend is required for `unfuse_lora()`.") - self.apply(self._unfuse_lora_apply) - - def _unfuse_lora_apply(self, module): - from peft.tuners.tuners_utils import BaseTunerLayer - - if isinstance(module, BaseTunerLayer): - module.unmerge() - - def unload_lora(self): - if not USE_PEFT_BACKEND: - raise ValueError("PEFT backend is required for `unload_lora()`.") - - from ..utils import recurse_remove_peft_layers - - recurse_remove_peft_layers(self) - if hasattr(self, "peft_config"): - del self.peft_config - - def set_adapters( - self, - adapter_names: Union[List[str], str], - weights: Optional[Union[float, Dict, List[float], List[Dict], List[None]]] = None, - ): - """ - Set the currently active adapters for use in the UNet. - - Args: - adapter_names (`List[str]` or `str`): - The names of the adapters to use. - adapter_weights (`Union[List[float], float]`, *optional*): - The adapter(s) weights to use with the UNet. If `None`, the weights are set to `1.0` for all the - adapters. - - Example: - - ```py - from diffusers import AutoPipelineForText2Image - import torch - - pipeline = AutoPipelineForText2Image.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 - ).to("cuda") - pipeline.load_lora_weights( - "jbilcke-hf/sdxl-cinematic-1", weight_name="pytorch_lora_weights.safetensors", adapter_name="cinematic" - ) - pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel") - pipeline.set_adapters(["cinematic", "pixel"], adapter_weights=[0.5, 0.5]) - ``` - """ - if not USE_PEFT_BACKEND: - raise ValueError("PEFT backend is required for `set_adapters()`.") - - adapter_names = [adapter_names] if isinstance(adapter_names, str) else adapter_names - - # Expand weights into a list, one entry per adapter - # examples for e.g. 2 adapters: [{...}, 7] -> [7,7] ; None -> [None, None] - if not isinstance(weights, list): - weights = [weights] * len(adapter_names) - - if len(adapter_names) != len(weights): - raise ValueError( - f"Length of adapter names {len(adapter_names)} is not equal to the length of their weights {len(weights)}." - ) - - # Set None values to default of 1.0 - # e.g. 
[{...}, 7] -> [{...}, 7] ; [None, None] -> [1.0, 1.0] - weights = [w if w is not None else 1.0 for w in weights] - - # e.g. [{...}, 7] -> [{expanded dict...}, 7] - weights = _maybe_expand_lora_scales(self, weights) - - set_weights_and_activate_adapters(self, adapter_names, weights) - - def disable_lora(self): - """ - Disable the UNet's active LoRA layers. - - Example: - - ```py - from diffusers import AutoPipelineForText2Image - import torch - - pipeline = AutoPipelineForText2Image.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 - ).to("cuda") - pipeline.load_lora_weights( - "jbilcke-hf/sdxl-cinematic-1", weight_name="pytorch_lora_weights.safetensors", adapter_name="cinematic" - ) - pipeline.disable_lora() - ``` - """ - if not USE_PEFT_BACKEND: - raise ValueError("PEFT backend is required for this method.") - set_adapter_layers(self, enabled=False) - - def enable_lora(self): - """ - Enable the UNet's active LoRA layers. - - Example: - - ```py - from diffusers import AutoPipelineForText2Image - import torch - - pipeline = AutoPipelineForText2Image.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 - ).to("cuda") - pipeline.load_lora_weights( - "jbilcke-hf/sdxl-cinematic-1", weight_name="pytorch_lora_weights.safetensors", adapter_name="cinematic" - ) - pipeline.enable_lora() - ``` - """ - if not USE_PEFT_BACKEND: - raise ValueError("PEFT backend is required for this method.") - set_adapter_layers(self, enabled=True) - - def delete_adapters(self, adapter_names: Union[List[str], str]): - """ - Delete an adapter's LoRA layers from the UNet. - - Args: - adapter_names (`Union[List[str], str]`): - The names (single string or list of strings) of the adapter to delete. - - Example: - - ```py - from diffusers import AutoPipelineForText2Image - import torch - - pipeline = AutoPipelineForText2Image.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 - ).to("cuda") - pipeline.load_lora_weights( - "jbilcke-hf/sdxl-cinematic-1", weight_name="pytorch_lora_weights.safetensors", adapter_names="cinematic" - ) - pipeline.delete_adapters("cinematic") - ``` - """ - if not USE_PEFT_BACKEND: - raise ValueError("PEFT backend is required for this method.") - - if isinstance(adapter_names, str): - adapter_names = [adapter_names] - - for adapter_name in adapter_names: - delete_adapter_layers(self, adapter_name) - - # Pop also the corresponding adapter from the config - if hasattr(self, "peft_config"): - self.peft_config.pop(adapter_name, None) - def _convert_ip_adapter_image_proj_to_diffusers(self, state_dict, low_cpu_mem_usage=False): if low_cpu_mem_usage: if is_accelerate_available(): @@ -1022,6 +823,15 @@ def _convert_ip_adapter_attn_to_diffusers(self, state_dicts, low_cpu_mem_usage=F def _load_ip_adapter_weights(self, state_dicts, low_cpu_mem_usage=False): if not isinstance(state_dicts, list): state_dicts = [state_dicts] + + # Kolors Unet already has a `encoder_hid_proj` + if ( + self.encoder_hid_proj is not None + and self.config.encoder_hid_dim_type == "text_proj" + and not hasattr(self, "text_encoder_hid_proj") + ): + self.text_encoder_hid_proj = self.encoder_hid_proj + # Set encoder_hid_proj after loading ip_adapter weights, # because `IPAdapterPlusImageProjection` also has `attn_processors`. 
self.encoder_hid_proj = None diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py index 39dc149ff6d1..d35786ee7642 100644 --- a/src/diffusers/models/__init__.py +++ b/src/diffusers/models/__init__.py @@ -35,6 +35,7 @@ _import_structure["controlnet"] = ["ControlNetModel"] _import_structure["controlnet_hunyuan"] = ["HunyuanDiT2DControlNetModel", "HunyuanDiT2DMultiControlNetModel"] _import_structure["controlnet_sd3"] = ["SD3ControlNetModel", "SD3MultiControlNetModel"] + _import_structure["controlnet_sparsectrl"] = ["SparseControlNetModel"] _import_structure["controlnet_xs"] = ["ControlNetXSAdapter", "UNetControlNetXSModel"] _import_structure["embeddings"] = ["ImageProjection"] _import_structure["modeling_utils"] = ["ModelMixin"] @@ -81,6 +82,7 @@ from .controlnet import ControlNetModel from .controlnet_hunyuan import HunyuanDiT2DControlNetModel, HunyuanDiT2DMultiControlNetModel from .controlnet_sd3 import SD3ControlNetModel, SD3MultiControlNetModel + from .controlnet_sparsectrl import SparseControlNetModel from .controlnet_xs import ControlNetXSAdapter, UNetControlNetXSModel from .embeddings import ImageProjection from .modeling_utils import ModelMixin diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index 961bcd2a49e8..6669222c695d 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -677,6 +677,21 @@ def fuse_projections(self, fuse=True): concatenated_bias = torch.cat([self.to_k.bias.data, self.to_v.bias.data]) self.to_kv.bias.copy_(concatenated_bias) + # handle added projections for SD3 and others. + if hasattr(self, "add_q_proj") and hasattr(self, "add_k_proj") and hasattr(self, "add_v_proj"): + concatenated_weights = torch.cat( + [self.add_q_proj.weight.data, self.add_k_proj.weight.data, self.add_v_proj.weight.data] + ) + in_features = concatenated_weights.shape[1] + out_features = concatenated_weights.shape[0] + + self.to_added_qkv = nn.Linear(in_features, out_features, bias=True, device=device, dtype=dtype) + self.to_added_qkv.weight.copy_(concatenated_weights) + concatenated_bias = torch.cat( + [self.add_q_proj.bias.data, self.add_k_proj.bias.data, self.add_v_proj.bias.data] + ) + self.to_added_qkv.bias.copy_(concatenated_bias) + self.fused_projections = fuse @@ -1167,7 +1182,6 @@ def __call__( attn: Attention, hidden_states: torch.FloatTensor, encoder_hidden_states: torch.FloatTensor = None, - i=0, *args, **kwargs, ) -> torch.FloatTensor: @@ -1708,6 +1722,109 @@ def __call__( return hidden_states +class FusedHunyuanAttnProcessor2_0: + r""" + Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0) with fused + projection layers. This is used in the HunyuanDiT model. It applies a s normalization layer and rotary embedding on + query and key vector. + """ + + def __init__(self): + if not hasattr(F, "scaled_dot_product_attention"): + raise ImportError( + "FusedHunyuanAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0." 
+ ) + + def __call__( + self, + attn: Attention, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None, + image_rotary_emb: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + from .embeddings import apply_rotary_emb + + residual = hidden_states + if attn.spatial_norm is not None: + hidden_states = attn.spatial_norm(hidden_states, temb) + + input_ndim = hidden_states.ndim + + if input_ndim == 4: + batch_size, channel, height, width = hidden_states.shape + hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) + + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + + if attention_mask is not None: + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + # scaled_dot_product_attention expects attention_mask shape to be + # (batch, heads, source_length, target_length) + attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) + + if attn.group_norm is not None: + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + if encoder_hidden_states is None: + qkv = attn.to_qkv(hidden_states) + split_size = qkv.shape[-1] // 3 + query, key, value = torch.split(qkv, split_size, dim=-1) + else: + if attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + query = attn.to_q(hidden_states) + + kv = attn.to_kv(encoder_hidden_states) + split_size = kv.shape[-1] // 2 + key, value = torch.split(kv, split_size, dim=-1) + + inner_dim = key.shape[-1] + head_dim = inner_dim // attn.heads + + query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + if attn.norm_q is not None: + query = attn.norm_q(query) + if attn.norm_k is not None: + key = attn.norm_k(key) + + # Apply RoPE if needed + if image_rotary_emb is not None: + query = apply_rotary_emb(query, image_rotary_emb) + if not attn.is_cross_attention: + key = apply_rotary_emb(key, image_rotary_emb) + + # the output of sdp = (batch, num_heads, seq_len, head_dim) + # TODO: add support for attn.scale when we move to Torch 2.1 + hidden_states = F.scaled_dot_product_attention( + query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False + ) + + hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) + hidden_states = hidden_states.to(query.dtype) + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + if input_ndim == 4: + hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) + + if attn.residual_connection: + hidden_states = hidden_states + residual + + hidden_states = hidden_states / attn.rescale_output_factor + + return hidden_states + + class LuminaAttnProcessor2_0: r""" Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). 
This is @@ -2190,7 +2307,7 @@ def __call__( (batch_size_attention, query_tokens, dim // attn.heads), device=query.device, dtype=query.dtype ) - for i in range(batch_size_attention // self.slice_size): + for i in range((batch_size_attention - 1) // self.slice_size + 1): start_idx = i * self.slice_size end_idx = (i + 1) * self.slice_size @@ -2287,7 +2404,7 @@ def __call__( (batch_size_attention, query_tokens, dim // attn.heads), device=query.device, dtype=query.dtype ) - for i in range(batch_size_attention // self.slice_size): + for i in range((batch_size_attention - 1) // self.slice_size + 1): start_idx = i * self.slice_size end_idx = (i + 1) * self.slice_size @@ -2845,12 +2962,6 @@ def __call__( # perturbed path (identity attention) batch_size, sequence_length, _ = hidden_states_ptb.shape - if attention_mask is not None: - attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) - # scaled_dot_product_attention expects attention_mask shape to be - # (batch, heads, source_length, target_length) - attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) - if attn.group_norm is not None: hidden_states_ptb = attn.group_norm(hidden_states_ptb.transpose(1, 2)).transpose(1, 2) @@ -2953,12 +3064,6 @@ def __call__( # perturbed path (identity attention) batch_size, sequence_length, _ = hidden_states_ptb.shape - if attention_mask is not None: - attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) - # scaled_dot_product_attention expects attention_mask shape to be - # (batch, heads, source_length, target_length) - attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) - if attn.group_norm is not None: hidden_states_ptb = attn.group_norm(hidden_states_ptb.transpose(1, 2)).transpose(1, 2) diff --git a/src/diffusers/models/autoencoders/autoencoder_kl.py b/src/diffusers/models/autoencoders/autoencoder_kl.py index 0d2f5f74d325..161770c67cf8 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl.py @@ -26,6 +26,7 @@ AttentionProcessor, AttnAddedKVProcessor, AttnProcessor, + FusedAttnProcessor2_0, ) from ..modeling_outputs import AutoencoderKLOutput from ..modeling_utils import ModelMixin @@ -62,6 +63,9 @@ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalModelMixin): If enabled it will force the VAE to run in float32 for high image resolution pipelines, such as SD-XL. VAE can be fine-tuned / trained to a lower range without loosing too much precision in which case `force_upcast` can be set to `False` - see: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix + mid_block_add_attention (`bool`, *optional*, default to `True`): + If enabled, the mid_block of the Encoder and Decoder will have attention blocks. 
If set to false, the + mid_block will only have resnet blocks """ _supports_gradient_checkpointing = True @@ -87,6 +91,7 @@ def __init__( force_upcast: float = True, use_quant_conv: bool = True, use_post_quant_conv: bool = True, + mid_block_add_attention: bool = True, ): super().__init__() @@ -100,6 +105,7 @@ def __init__( act_fn=act_fn, norm_num_groups=norm_num_groups, double_z=True, + mid_block_add_attention=mid_block_add_attention, ) # pass init params to Decoder @@ -111,6 +117,7 @@ def __init__( layers_per_block=layers_per_block, norm_num_groups=norm_num_groups, act_fn=act_fn, + mid_block_add_attention=mid_block_add_attention, ) self.quant_conv = nn.Conv2d(2 * latent_channels, 2 * latent_channels, 1) if use_quant_conv else None @@ -486,6 +493,8 @@ def fuse_qkv_projections(self): if isinstance(module, Attention): module.fuse_projections(fuse=True) + self.set_attn_processor(FusedAttnProcessor2_0()) + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections def unfuse_qkv_projections(self): """Disables the fused QKV projection if enabled. diff --git a/src/diffusers/models/controlnet.py b/src/diffusers/models/controlnet.py index 8fb49d7b547f..d3ae96605077 100644 --- a/src/diffusers/models/controlnet.py +++ b/src/diffusers/models/controlnet.py @@ -830,7 +830,6 @@ def forward( sample = self.mid_block(sample, emb) # 5. Control net blocks - controlnet_down_block_res_samples = () for down_block_res_sample, controlnet_block in zip(down_block_res_samples, self.controlnet_down_blocks): diff --git a/src/diffusers/models/controlnet_sd3.py b/src/diffusers/models/controlnet_sd3.py index 25eb6384c68c..305401164b2f 100644 --- a/src/diffusers/models/controlnet_sd3.py +++ b/src/diffusers/models/controlnet_sd3.py @@ -22,7 +22,7 @@ from ..configuration_utils import ConfigMixin, register_to_config from ..loaders import FromOriginalModelMixin, PeftAdapterMixin from ..models.attention import JointTransformerBlock -from ..models.attention_processor import Attention, AttentionProcessor +from ..models.attention_processor import Attention, AttentionProcessor, FusedJointAttnProcessor2_0 from ..models.modeling_outputs import Transformer2DModelOutput from ..models.modeling_utils import ModelMixin from ..utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers @@ -196,7 +196,7 @@ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): for name, module in self.named_children(): fn_recursive_attn_processor(name, module, processor) - # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections + # Copied from diffusers.models.transformers.transformer_sd3.SD3Transformer2DModel.fuse_qkv_projections def fuse_qkv_projections(self): """ Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value) @@ -220,6 +220,8 @@ def fuse_qkv_projections(self): if isinstance(module, Attention): module.fuse_projections(fuse=True) + self.set_attn_processor(FusedJointAttnProcessor2_0()) + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections def unfuse_qkv_projections(self): """Disables the fused QKV projection if enabled. diff --git a/src/diffusers/models/controlnet_sparsectrl.py b/src/diffusers/models/controlnet_sparsectrl.py new file mode 100644 index 000000000000..bc1273aaab7d --- /dev/null +++ b/src/diffusers/models/controlnet_sparsectrl.py @@ -0,0 +1,791 @@ +# Copyright 2024 The HuggingFace Team. 
All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput, logging
+from .attention_processor import (
+    ADDED_KV_ATTENTION_PROCESSORS,
+    CROSS_ATTENTION_PROCESSORS,
+    AttentionProcessor,
+    AttnAddedKVProcessor,
+    AttnProcessor,
+)
+from .embeddings import TimestepEmbedding, Timesteps
+from .modeling_utils import ModelMixin
+from .unets.unet_2d_blocks import UNetMidBlock2DCrossAttn
+from .unets.unet_2d_condition import UNet2DConditionModel
+from .unets.unet_3d_blocks import (
+    CrossAttnDownBlockMotion,
+    DownBlockMotion,
+)
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+@dataclass
+class SparseControlNetOutput(BaseOutput):
+    """
+    The output of [`SparseControlNetModel`].
+
+    Args:
+        down_block_res_samples (`tuple[torch.Tensor]`):
+            A tuple of downsample activations at different resolutions for each downsampling block. Each tensor should
+            be of shape `(batch_size, channel * resolution, height // resolution, width // resolution)`. Output can be
+            used to condition the original UNet's downsampling activations.
+        mid_block_res_sample (`torch.Tensor`):
+            The activation of the middle block (the lowest sample resolution). The tensor should be of shape
+            `(batch_size, channel * lowest_resolution, height // lowest_resolution, width // lowest_resolution)`.
+            Output can be used to condition the original UNet's middle block activation.
+    """
+
+    down_block_res_samples: Tuple[torch.Tensor]
+    mid_block_res_sample: torch.Tensor
+
+
+class SparseControlNetConditioningEmbedding(nn.Module):
+    def __init__(
+        self,
+        conditioning_embedding_channels: int,
+        conditioning_channels: int = 3,
+        block_out_channels: Tuple[int, ...]
= (16, 32, 96, 256), + ): + super().__init__() + + self.conv_in = nn.Conv2d(conditioning_channels, block_out_channels[0], kernel_size=3, padding=1) + self.blocks = nn.ModuleList([]) + + for i in range(len(block_out_channels) - 1): + channel_in = block_out_channels[i] + channel_out = block_out_channels[i + 1] + self.blocks.append(nn.Conv2d(channel_in, channel_in, kernel_size=3, padding=1)) + self.blocks.append(nn.Conv2d(channel_in, channel_out, kernel_size=3, padding=1, stride=2)) + + self.conv_out = zero_module( + nn.Conv2d(block_out_channels[-1], conditioning_embedding_channels, kernel_size=3, padding=1) + ) + + def forward(self, conditioning: torch.Tensor) -> torch.Tensor: + embedding = self.conv_in(conditioning) + embedding = F.silu(embedding) + + for block in self.blocks: + embedding = block(embedding) + embedding = F.silu(embedding) + + embedding = self.conv_out(embedding) + return embedding + + +class SparseControlNetModel(ModelMixin, ConfigMixin): + """ + A SparseControlNet model as described in [SparseCtrl: Adding Sparse Controls to Text-to-Video Diffusion + Models](https://arxiv.org/abs/2311.16933). + + Args: + in_channels (`int`, defaults to 4): + The number of channels in the input sample. + conditioning_channels (`int`, defaults to 4): + The number of input channels in the controlnet conditional embedding module. If + `concat_condition_embedding` is True, the value provided here is incremented by 1. + flip_sin_to_cos (`bool`, defaults to `True`): + Whether to flip the sin to cos in the time embedding. + freq_shift (`int`, defaults to 0): + The frequency shift to apply to the time embedding. + down_block_types (`tuple[str]`, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`): + The tuple of downsample blocks to use. + only_cross_attention (`Union[bool, Tuple[bool]]`, defaults to `False`): + block_out_channels (`tuple[int]`, defaults to `(320, 640, 1280, 1280)`): + The tuple of output channels for each block. + layers_per_block (`int`, defaults to 2): + The number of layers per block. + downsample_padding (`int`, defaults to 1): + The padding to use for the downsampling convolution. + mid_block_scale_factor (`float`, defaults to 1): + The scale factor to use for the mid block. + act_fn (`str`, defaults to "silu"): + The activation function to use. + norm_num_groups (`int`, *optional*, defaults to 32): + The number of groups to use for the normalization. If None, normalization and activation layers is skipped + in post-processing. + norm_eps (`float`, defaults to 1e-5): + The epsilon to use for the normalization. + cross_attention_dim (`int`, defaults to 1280): + The dimension of the cross attention features. + transformer_layers_per_block (`int` or `Tuple[int]`, *optional*, defaults to 1): + The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for + [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`], + [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`]. + transformer_layers_per_mid_block (`int` or `Tuple[int]`, *optional*, defaults to 1): + The number of transformer layers to use in each layer in the middle block. + attention_head_dim (`int` or `Tuple[int]`, defaults to 8): + The dimension of the attention heads. + num_attention_heads (`int` or `Tuple[int]`, *optional*): + The number of heads to use for multi-head attention. 
+ use_linear_projection (`bool`, defaults to `False`): + upcast_attention (`bool`, defaults to `False`): + resnet_time_scale_shift (`str`, defaults to `"default"`): + Time scale shift config for ResNet blocks (see `ResnetBlock2D`). Choose from `default` or `scale_shift`. + conditioning_embedding_out_channels (`Tuple[int]`, defaults to `(16, 32, 96, 256)`): + The tuple of output channel for each block in the `conditioning_embedding` layer. + global_pool_conditions (`bool`, defaults to `False`): + TODO(Patrick) - unused parameter + controlnet_conditioning_channel_order (`str`, defaults to `rgb`): + motion_max_seq_length (`int`, defaults to `32`): + The maximum sequence length to use in the motion module. + motion_num_attention_heads (`int` or `Tuple[int]`, defaults to `8`): + The number of heads to use in each attention layer of the motion module. + concat_conditioning_mask (`bool`, defaults to `True`): + use_simplified_condition_embedding (`bool`, defaults to `True`): + """ + + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + in_channels: int = 4, + conditioning_channels: int = 4, + flip_sin_to_cos: bool = True, + freq_shift: int = 0, + down_block_types: Tuple[str, ...] = ( + "CrossAttnDownBlockMotion", + "CrossAttnDownBlockMotion", + "CrossAttnDownBlockMotion", + "DownBlockMotion", + ), + only_cross_attention: Union[bool, Tuple[bool]] = False, + block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280), + layers_per_block: int = 2, + downsample_padding: int = 1, + mid_block_scale_factor: float = 1, + act_fn: str = "silu", + norm_num_groups: Optional[int] = 32, + norm_eps: float = 1e-5, + cross_attention_dim: int = 768, + transformer_layers_per_block: Union[int, Tuple[int, ...]] = 1, + transformer_layers_per_mid_block: Optional[Union[int, Tuple[int]]] = None, + temporal_transformer_layers_per_block: Union[int, Tuple[int, ...]] = 1, + attention_head_dim: Union[int, Tuple[int, ...]] = 8, + num_attention_heads: Optional[Union[int, Tuple[int, ...]]] = None, + use_linear_projection: bool = False, + upcast_attention: bool = False, + resnet_time_scale_shift: str = "default", + conditioning_embedding_out_channels: Optional[Tuple[int, ...]] = (16, 32, 96, 256), + global_pool_conditions: bool = False, + controlnet_conditioning_channel_order: str = "rgb", + motion_max_seq_length: int = 32, + motion_num_attention_heads: int = 8, + concat_conditioning_mask: bool = True, + use_simplified_condition_embedding: bool = True, + ): + super().__init__() + self.use_simplified_condition_embedding = use_simplified_condition_embedding + + # If `num_attention_heads` is not defined (which is the case for most models) + # it will default to `attention_head_dim`. This looks weird upon first reading it and it is. + # The reason for this behavior is to correct for incorrectly named variables that were introduced + # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131 + # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking + # which is why we correct for the naming here. + num_attention_heads = num_attention_heads or attention_head_dim + + # Check inputs + if len(block_out_channels) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}." 
+ ) + + if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}." + ) + + if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}." + ) + + if isinstance(transformer_layers_per_block, int): + transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types) + if isinstance(temporal_transformer_layers_per_block, int): + temporal_transformer_layers_per_block = [temporal_transformer_layers_per_block] * len(down_block_types) + + # input + conv_in_kernel = 3 + conv_in_padding = (conv_in_kernel - 1) // 2 + self.conv_in = nn.Conv2d( + in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding + ) + + if concat_conditioning_mask: + conditioning_channels = conditioning_channels + 1 + + self.concat_conditioning_mask = concat_conditioning_mask + + # control net conditioning embedding + if use_simplified_condition_embedding: + self.controlnet_cond_embedding = zero_module( + nn.Conv2d(conditioning_channels, block_out_channels[0], kernel_size=3, padding=1) + ) + else: + self.controlnet_cond_embedding = SparseControlNetConditioningEmbedding( + conditioning_embedding_channels=block_out_channels[0], + block_out_channels=conditioning_embedding_out_channels, + conditioning_channels=conditioning_channels, + ) + + # time + time_embed_dim = block_out_channels[0] * 4 + self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) + timestep_input_dim = block_out_channels[0] + + self.time_embedding = TimestepEmbedding( + timestep_input_dim, + time_embed_dim, + act_fn=act_fn, + ) + + self.down_blocks = nn.ModuleList([]) + self.controlnet_down_blocks = nn.ModuleList([]) + + if isinstance(cross_attention_dim, int): + cross_attention_dim = (cross_attention_dim,) * len(down_block_types) + + if isinstance(only_cross_attention, bool): + only_cross_attention = [only_cross_attention] * len(down_block_types) + + if isinstance(attention_head_dim, int): + attention_head_dim = (attention_head_dim,) * len(down_block_types) + + if isinstance(num_attention_heads, int): + num_attention_heads = (num_attention_heads,) * len(down_block_types) + + if isinstance(motion_num_attention_heads, int): + motion_num_attention_heads = (motion_num_attention_heads,) * len(down_block_types) + + # down + output_channel = block_out_channels[0] + + controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1) + controlnet_block = zero_module(controlnet_block) + self.controlnet_down_blocks.append(controlnet_block) + + for i, down_block_type in enumerate(down_block_types): + input_channel = output_channel + output_channel = block_out_channels[i] + is_final_block = i == len(block_out_channels) - 1 + + if down_block_type == "CrossAttnDownBlockMotion": + down_block = CrossAttnDownBlockMotion( + in_channels=input_channel, + out_channels=output_channel, + temb_channels=time_embed_dim, + dropout=0, + num_layers=layers_per_block, + transformer_layers_per_block=transformer_layers_per_block[i], + resnet_eps=norm_eps, + resnet_time_scale_shift=resnet_time_scale_shift, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + 
resnet_pre_norm=True, + num_attention_heads=num_attention_heads[i], + cross_attention_dim=cross_attention_dim[i], + add_downsample=not is_final_block, + dual_cross_attention=False, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention[i], + upcast_attention=upcast_attention, + temporal_num_attention_heads=motion_num_attention_heads[i], + temporal_max_seq_length=motion_max_seq_length, + temporal_transformer_layers_per_block=temporal_transformer_layers_per_block[i], + temporal_double_self_attention=False, + ) + elif down_block_type == "DownBlockMotion": + down_block = DownBlockMotion( + in_channels=input_channel, + out_channels=output_channel, + temb_channels=time_embed_dim, + dropout=0, + num_layers=layers_per_block, + resnet_eps=norm_eps, + resnet_time_scale_shift=resnet_time_scale_shift, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + resnet_pre_norm=True, + add_downsample=not is_final_block, + temporal_num_attention_heads=motion_num_attention_heads[i], + temporal_max_seq_length=motion_max_seq_length, + temporal_double_self_attention=False, + temporal_transformer_layers_per_block=temporal_transformer_layers_per_block[i], + ) + else: + raise ValueError( + "Invalid `block_type` encountered. Must be one of `CrossAttnDownBlockMotion` or `DownBlockMotion`" + ) + + self.down_blocks.append(down_block) + + for _ in range(layers_per_block): + controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1) + controlnet_block = zero_module(controlnet_block) + self.controlnet_down_blocks.append(controlnet_block) + + if not is_final_block: + controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1) + controlnet_block = zero_module(controlnet_block) + self.controlnet_down_blocks.append(controlnet_block) + + # mid + mid_block_channels = block_out_channels[-1] + + controlnet_block = nn.Conv2d(mid_block_channels, mid_block_channels, kernel_size=1) + controlnet_block = zero_module(controlnet_block) + self.controlnet_mid_block = controlnet_block + + if transformer_layers_per_mid_block is None: + transformer_layers_per_mid_block = ( + transformer_layers_per_block[-1] if isinstance(transformer_layers_per_block[-1], int) else 1 + ) + + self.mid_block = UNetMidBlock2DCrossAttn( + in_channels=mid_block_channels, + temb_channels=time_embed_dim, + dropout=0, + num_layers=1, + transformer_layers_per_block=transformer_layers_per_mid_block, + resnet_eps=norm_eps, + resnet_time_scale_shift=resnet_time_scale_shift, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + resnet_pre_norm=True, + num_attention_heads=num_attention_heads[-1], + output_scale_factor=mid_block_scale_factor, + cross_attention_dim=cross_attention_dim[-1], + dual_cross_attention=False, + use_linear_projection=use_linear_projection, + upcast_attention=upcast_attention, + attention_type="default", + ) + + @classmethod + def from_unet( + cls, + unet: UNet2DConditionModel, + controlnet_conditioning_channel_order: str = "rgb", + conditioning_embedding_out_channels: Optional[Tuple[int, ...]] = (16, 32, 96, 256), + load_weights_from_unet: bool = True, + conditioning_channels: int = 3, + ) -> "SparseControlNetModel": + r""" + Instantiate a [`SparseControlNetModel`] from [`UNet2DConditionModel`]. + + Parameters: + unet (`UNet2DConditionModel`): + The UNet model weights to copy to the [`SparseControlNetModel`]. All configuration options are also + copied where applicable. 
+ """ + transformer_layers_per_block = ( + unet.config.transformer_layers_per_block if "transformer_layers_per_block" in unet.config else 1 + ) + down_block_types = unet.config.down_block_types + + for i in range(len(down_block_types)): + if "CrossAttn" in down_block_types[i]: + down_block_types[i] = "CrossAttnDownBlockMotion" + elif "Down" in down_block_types[i]: + down_block_types[i] = "DownBlockMotion" + else: + raise ValueError("Invalid `block_type` encountered. Must be a cross-attention or down block") + + controlnet = cls( + in_channels=unet.config.in_channels, + conditioning_channels=conditioning_channels, + flip_sin_to_cos=unet.config.flip_sin_to_cos, + freq_shift=unet.config.freq_shift, + down_block_types=unet.config.down_block_types, + only_cross_attention=unet.config.only_cross_attention, + block_out_channels=unet.config.block_out_channels, + layers_per_block=unet.config.layers_per_block, + downsample_padding=unet.config.downsample_padding, + mid_block_scale_factor=unet.config.mid_block_scale_factor, + act_fn=unet.config.act_fn, + norm_num_groups=unet.config.norm_num_groups, + norm_eps=unet.config.norm_eps, + cross_attention_dim=unet.config.cross_attention_dim, + transformer_layers_per_block=transformer_layers_per_block, + attention_head_dim=unet.config.attention_head_dim, + num_attention_heads=unet.config.num_attention_heads, + use_linear_projection=unet.config.use_linear_projection, + upcast_attention=unet.config.upcast_attention, + resnet_time_scale_shift=unet.config.resnet_time_scale_shift, + conditioning_embedding_out_channels=conditioning_embedding_out_channels, + controlnet_conditioning_channel_order=controlnet_conditioning_channel_order, + ) + + if load_weights_from_unet: + controlnet.conv_in.load_state_dict(unet.conv_in.state_dict(), strict=False) + controlnet.time_proj.load_state_dict(unet.time_proj.state_dict(), strict=False) + controlnet.time_embedding.load_state_dict(unet.time_embedding.state_dict(), strict=False) + controlnet.down_blocks.load_state_dict(unet.down_blocks.state_dict(), strict=False) + controlnet.mid_block.load_state_dict(unet.mid_block.state_dict(), strict=False) + + return controlnet + + @property + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors + def attn_processors(self) -> Dict[str, AttentionProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. + """ + # set recursively + processors = {} + + def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]): + if hasattr(module, "get_processor"): + processors[f"{name}.processor"] = module.get_processor() + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor + def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): + r""" + Sets the attention processor to use to compute attention. + + Parameters: + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + for **all** `Attention` layers. 
+ + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." + ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor + def set_default_attn_processor(self): + """ + Disables custom attention processors and sets the default attention implementation. + """ + if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnAddedKVProcessor() + elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnProcessor() + else: + raise ValueError( + f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}" + ) + + self.set_attn_processor(processor) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attention_slice + def set_attention_slice(self, slice_size: Union[str, int, List[int]]) -> None: + r""" + Enable sliced attention computation. + + When this option is enabled, the attention module splits the input tensor in slices to compute attention in + several steps. This is useful for saving some memory in exchange for a small decrease in speed. + + Args: + slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`): + When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If + `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is + provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim` + must be a multiple of `slice_size`. 
+ """ + sliceable_head_dims = [] + + def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module): + if hasattr(module, "set_attention_slice"): + sliceable_head_dims.append(module.sliceable_head_dim) + + for child in module.children(): + fn_recursive_retrieve_sliceable_dims(child) + + # retrieve number of attention layers + for module in self.children(): + fn_recursive_retrieve_sliceable_dims(module) + + num_sliceable_layers = len(sliceable_head_dims) + + if slice_size == "auto": + # half the attention head size is usually a good trade-off between + # speed and memory + slice_size = [dim // 2 for dim in sliceable_head_dims] + elif slice_size == "max": + # make smallest slice possible + slice_size = num_sliceable_layers * [1] + + slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size + + if len(slice_size) != len(sliceable_head_dims): + raise ValueError( + f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different" + f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}." + ) + + for i in range(len(slice_size)): + size = slice_size[i] + dim = sliceable_head_dims[i] + if size is not None and size > dim: + raise ValueError(f"size {size} has to be smaller or equal to {dim}.") + + # Recursively walk through all the children. + # Any children which exposes the set_attention_slice method + # gets the message + def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]): + if hasattr(module, "set_attention_slice"): + module.set_attention_slice(slice_size.pop()) + + for child in module.children(): + fn_recursive_set_attention_slice(child, slice_size) + + reversed_slice_size = list(reversed(slice_size)) + for module in self.children(): + fn_recursive_set_attention_slice(module, reversed_slice_size) + + def _set_gradient_checkpointing(self, module, value: bool = False) -> None: + if isinstance(module, (CrossAttnDownBlockMotion, DownBlockMotion, UNetMidBlock2DCrossAttn)): + module.gradient_checkpointing = value + + def forward( + self, + sample: torch.Tensor, + timestep: Union[torch.Tensor, float, int], + encoder_hidden_states: torch.Tensor, + controlnet_cond: torch.Tensor, + conditioning_scale: float = 1.0, + timestep_cond: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + conditioning_mask: Optional[torch.Tensor] = None, + guess_mode: bool = False, + return_dict: bool = True, + ) -> Union[SparseControlNetOutput, Tuple[Tuple[torch.Tensor, ...], torch.Tensor]]: + """ + The [`SparseControlNetModel`] forward method. + + Args: + sample (`torch.Tensor`): + The noisy input tensor. + timestep (`Union[torch.Tensor, float, int]`): + The number of timesteps to denoise an input. + encoder_hidden_states (`torch.Tensor`): + The encoder hidden states. + controlnet_cond (`torch.Tensor`): + The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`. + conditioning_scale (`float`, defaults to `1.0`): + The scale factor for ControlNet outputs. + class_labels (`torch.Tensor`, *optional*, defaults to `None`): + Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings. + timestep_cond (`torch.Tensor`, *optional*, defaults to `None`): + Additional conditional embeddings for timestep. 
If provided, the embeddings will be summed with the + timestep_embedding passed through the `self.time_embedding` layer to obtain the final timestep + embeddings. + attention_mask (`torch.Tensor`, *optional*, defaults to `None`): + An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask + is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large + negative values to the attention scores corresponding to "discard" tokens. + added_cond_kwargs (`dict`): + Additional conditions for the Stable Diffusion XL UNet. + cross_attention_kwargs (`dict[str]`, *optional*, defaults to `None`): + A kwargs dictionary that if specified is passed along to the `AttnProcessor`. + guess_mode (`bool`, defaults to `False`): + In this mode, the ControlNet encoder tries its best to recognize the input content of the input even if + you remove all prompts. A `guidance_scale` between 3.0 and 5.0 is recommended. + return_dict (`bool`, defaults to `True`): + Whether or not to return a [`~models.controlnet.ControlNetOutput`] instead of a plain tuple. + Returns: + [`~models.controlnet.ControlNetOutput`] **or** `tuple`: + If `return_dict` is `True`, a [`~models.controlnet.ControlNetOutput`] is returned, otherwise a tuple is + returned where the first element is the sample tensor. + """ + sample_batch_size, sample_channels, sample_num_frames, sample_height, sample_width = sample.shape + sample = torch.zeros_like(sample) + + # check channel order + channel_order = self.config.controlnet_conditioning_channel_order + + if channel_order == "rgb": + # in rgb order by default + ... + elif channel_order == "bgr": + controlnet_cond = torch.flip(controlnet_cond, dims=[1]) + else: + raise ValueError(f"unknown `controlnet_conditioning_channel_order`: {channel_order}") + + # prepare attention_mask + if attention_mask is not None: + attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0 + attention_mask = attention_mask.unsqueeze(1) + + # 1. time + timesteps = timestep + if not torch.is_tensor(timesteps): + # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can + # This would be a good case for the `match` statement (Python 3.10+) + is_mps = sample.device.type == "mps" + if isinstance(timestep, float): + dtype = torch.float32 if is_mps else torch.float64 + else: + dtype = torch.int32 if is_mps else torch.int64 + timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device) + elif len(timesteps.shape) == 0: + timesteps = timesteps[None].to(sample.device) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timesteps = timesteps.expand(sample.shape[0]) + + t_emb = self.time_proj(timesteps) + + # timesteps does not contain any weights and will always return f32 tensors + # but time_embedding might actually be running in fp16. so we need to cast here. + # there might be better ways to encapsulate this. + t_emb = t_emb.to(dtype=sample.dtype) + + emb = self.time_embedding(t_emb, timestep_cond) + emb = emb.repeat_interleave(sample_num_frames, dim=0) + encoder_hidden_states = encoder_hidden_states.repeat_interleave(sample_num_frames, dim=0) + + # 2. 
pre-process + batch_size, channels, num_frames, height, width = sample.shape + + sample = sample.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width) + sample = self.conv_in(sample) + + batch_frames, channels, height, width = sample.shape + sample = sample[:, None].reshape(sample_batch_size, sample_num_frames, channels, height, width) + + if self.concat_conditioning_mask: + controlnet_cond = torch.cat([controlnet_cond, conditioning_mask], dim=1) + + batch_size, channels, num_frames, height, width = controlnet_cond.shape + controlnet_cond = controlnet_cond.permute(0, 2, 1, 3, 4).reshape( + batch_size * num_frames, channels, height, width + ) + controlnet_cond = self.controlnet_cond_embedding(controlnet_cond) + batch_frames, channels, height, width = controlnet_cond.shape + controlnet_cond = controlnet_cond[:, None].reshape(batch_size, num_frames, channels, height, width) + + sample = sample + controlnet_cond + + batch_size, num_frames, channels, height, width = sample.shape + sample = sample.reshape(sample_batch_size * sample_num_frames, channels, height, width) + + # 3. down + down_block_res_samples = (sample,) + for downsample_block in self.down_blocks: + if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: + sample, res_samples = downsample_block( + hidden_states=sample, + temb=emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + num_frames=num_frames, + cross_attention_kwargs=cross_attention_kwargs, + ) + else: + sample, res_samples = downsample_block(hidden_states=sample, temb=emb, num_frames=num_frames) + + down_block_res_samples += res_samples + + # 4. mid + if self.mid_block is not None: + if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention: + sample = self.mid_block( + sample, + emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + ) + else: + sample = self.mid_block(sample, emb) + + # 5. Control net blocks + controlnet_down_block_res_samples = () + + for down_block_res_sample, controlnet_block in zip(down_block_res_samples, self.controlnet_down_blocks): + down_block_res_sample = controlnet_block(down_block_res_sample) + controlnet_down_block_res_samples = controlnet_down_block_res_samples + (down_block_res_sample,) + + down_block_res_samples = controlnet_down_block_res_samples + mid_block_res_sample = self.controlnet_mid_block(sample) + + # 6. 
scaling + if guess_mode and not self.config.global_pool_conditions: + scales = torch.logspace(-1, 0, len(down_block_res_samples) + 1, device=sample.device) # 0.1 to 1.0 + scales = scales * conditioning_scale + down_block_res_samples = [sample * scale for sample, scale in zip(down_block_res_samples, scales)] + mid_block_res_sample = mid_block_res_sample * scales[-1] # last one + else: + down_block_res_samples = [sample * conditioning_scale for sample in down_block_res_samples] + mid_block_res_sample = mid_block_res_sample * conditioning_scale + + if self.config.global_pool_conditions: + down_block_res_samples = [ + torch.mean(sample, dim=(2, 3), keepdim=True) for sample in down_block_res_samples + ] + mid_block_res_sample = torch.mean(mid_block_res_sample, dim=(2, 3), keepdim=True) + + if not return_dict: + return (down_block_res_samples, mid_block_res_sample) + + return SparseControlNetOutput( + down_block_res_samples=down_block_res_samples, mid_block_res_sample=mid_block_res_sample + ) + + +# Copied from diffusers.models.controlnet.zero_module +def zero_module(module: nn.Module) -> nn.Module: + for p in module.parameters(): + nn.init.zeros_(p) + return module diff --git a/src/diffusers/models/controlnet_xs.py b/src/diffusers/models/controlnet_xs.py index 354acfebe0a2..0fa21755f09c 100644 --- a/src/diffusers/models/controlnet_xs.py +++ b/src/diffusers/models/controlnet_xs.py @@ -29,6 +29,7 @@ AttentionProcessor, AttnAddedKVProcessor, AttnProcessor, + FusedAttnProcessor2_0, ) from .controlnet import ControlNetConditioningEmbedding from .embeddings import TimestepEmbedding, Timesteps @@ -1001,6 +1002,8 @@ def fuse_qkv_projections(self): if isinstance(module, Attention): module.fuse_projections(fuse=True) + self.set_attn_processor(FusedAttnProcessor2_0()) + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections def unfuse_qkv_projections(self): """Disables the fused QKV projection if enabled. diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py index 0890842f5775..7684fdf9cd6c 100644 --- a/src/diffusers/models/embeddings.py +++ b/src/diffusers/models/embeddings.py @@ -319,12 +319,16 @@ def get_2d_rotary_pos_embed_from_grid(embed_dim, grid, use_real=False): assert embed_dim % 4 == 0 # use half of dimensions to encode grid_h - emb_h = get_1d_rotary_pos_embed(embed_dim // 2, grid[0].reshape(-1), use_real=use_real) # (H*W, D/4) - emb_w = get_1d_rotary_pos_embed(embed_dim // 2, grid[1].reshape(-1), use_real=use_real) # (H*W, D/4) + emb_h = get_1d_rotary_pos_embed( + embed_dim // 2, grid[0].reshape(-1), use_real=use_real + ) # (H*W, D/2) if use_real else (H*W, D/4) + emb_w = get_1d_rotary_pos_embed( + embed_dim // 2, grid[1].reshape(-1), use_real=use_real + ) # (H*W, D/2) if use_real else (H*W, D/4) if use_real: - cos = torch.cat([emb_h[0], emb_w[0]], dim=1) # (H*W, D/2) - sin = torch.cat([emb_h[1], emb_w[1]], dim=1) # (H*W, D/2) + cos = torch.cat([emb_h[0], emb_w[0]], dim=1) # (H*W, D) + sin = torch.cat([emb_h[1], emb_w[1]], dim=1) # (H*W, D) return cos, sin else: emb = torch.cat([emb_h, emb_w], dim=1) # (H*W, D/2) @@ -371,6 +375,8 @@ def get_1d_rotary_pos_embed( Returns: `torch.Tensor`: Precomputed frequency tensor with complex exponentials. 
[S, D/2] """ + assert dim % 2 == 0 + if isinstance(pos, int): pos = np.arange(pos) theta = theta * ntk_factor diff --git a/src/diffusers/models/model_loading_utils.py b/src/diffusers/models/model_loading_utils.py index ebd356d981d6..969eb5f5fa37 100644 --- a/src/diffusers/models/model_loading_utils.py +++ b/src/diffusers/models/model_loading_utils.py @@ -191,7 +191,6 @@ def _fetch_index_file( cache_dir, variant, force_download, - resume_download, proxies, local_files_only, token, @@ -216,7 +215,6 @@ def _fetch_index_file( weights_name=index_file_in_repo, cache_dir=cache_dir, force_download=force_download, - resume_download=resume_download, proxies=proxies, local_files_only=local_files_only, token=token, diff --git a/src/diffusers/models/modeling_flax_utils.py b/src/diffusers/models/modeling_flax_utils.py index 151281070faa..8c35fab0fc16 100644 --- a/src/diffusers/models/modeling_flax_utils.py +++ b/src/diffusers/models/modeling_flax_utils.py @@ -245,9 +245,7 @@ def from_pretrained( force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. - resume_download: - Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 - of Diffusers. + proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. @@ -296,7 +294,6 @@ def from_pretrained( cache_dir = kwargs.pop("cache_dir", None) force_download = kwargs.pop("force_download", False) from_pt = kwargs.pop("from_pt", False) - resume_download = kwargs.pop("resume_download", None) proxies = kwargs.pop("proxies", None) local_files_only = kwargs.pop("local_files_only", False) token = kwargs.pop("token", None) @@ -316,7 +313,6 @@ def from_pretrained( cache_dir=cache_dir, return_unused_kwargs=True, force_download=force_download, - resume_download=resume_download, proxies=proxies, local_files_only=local_files_only, token=token, @@ -362,7 +358,6 @@ def from_pretrained( cache_dir=cache_dir, force_download=force_download, proxies=proxies, - resume_download=resume_download, local_files_only=local_files_only, token=token, user_agent=user_agent, diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py index 3fca24c0fc15..f7324009f3c6 100644 --- a/src/diffusers/models/modeling_utils.py +++ b/src/diffusers/models/modeling_utils.py @@ -434,9 +434,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. - resume_download: - Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 - of Diffusers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. 
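Editor's note: the `from_pretrained` hunks above and below drop the deprecated `resume_download` argument; interrupted downloads are already resumed by the Hub download layer. A minimal sketch of the calling convention after this change, assuming a diffusers-format checkpoint (the repo id is borrowed from the example docstring later in this patch and is only illustrative):

```python
import torch
from diffusers import UNet2DConditionModel

# No `resume_download=...` anymore: partially downloaded files are resumed automatically.
unet = UNet2DConditionModel.from_pretrained(
    "SG161222/Realistic_Vision_V5.1_noVAE",  # illustrative checkpoint in diffusers format
    subfolder="unet",
    torch_dtype=torch.float16,
    force_download=False,  # still available if a clean re-download is needed
)
```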
@@ -518,7 +515,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False) force_download = kwargs.pop("force_download", False) from_flax = kwargs.pop("from_flax", False) - resume_download = kwargs.pop("resume_download", None) proxies = kwargs.pop("proxies", None) output_loading_info = kwargs.pop("output_loading_info", False) local_files_only = kwargs.pop("local_files_only", None) @@ -619,7 +615,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P return_unused_kwargs=True, return_commit_hash=True, force_download=force_download, - resume_download=resume_download, proxies=proxies, local_files_only=local_files_only, token=token, @@ -641,7 +636,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P cache_dir=cache_dir, variant=variant, force_download=force_download, - resume_download=resume_download, proxies=proxies, local_files_only=local_files_only, token=token, @@ -663,7 +657,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P weights_name=FLAX_WEIGHTS_NAME, cache_dir=cache_dir, force_download=force_download, - resume_download=resume_download, proxies=proxies, local_files_only=local_files_only, token=token, @@ -685,7 +678,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P index_file, cache_dir=cache_dir, proxies=proxies, - resume_download=resume_download, local_files_only=local_files_only, token=token, user_agent=user_agent, @@ -700,7 +692,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P weights_name=_add_variant(SAFETENSORS_WEIGHTS_NAME, variant), cache_dir=cache_dir, force_download=force_download, - resume_download=resume_download, proxies=proxies, local_files_only=local_files_only, token=token, @@ -724,7 +715,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P weights_name=_add_variant(WEIGHTS_NAME, variant), cache_dir=cache_dir, force_download=force_download, - resume_download=resume_download, proxies=proxies, local_files_only=local_files_only, token=token, @@ -1177,7 +1167,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P cache_dir = kwargs.pop("cache_dir", None) force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", None) proxies = kwargs.pop("proxies", None) local_files_only = kwargs.pop("local_files_only", None) token = kwargs.pop("token", None) @@ -1200,7 +1189,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P return_unused_kwargs=True, return_commit_hash=True, force_download=force_download, - resume_download=resume_download, proxies=proxies, local_files_only=local_files_only, token=token, diff --git a/src/diffusers/models/transformers/auraflow_transformer_2d.py b/src/diffusers/models/transformers/auraflow_transformer_2d.py index 342373b4c11d..89d51969aeaa 100644 --- a/src/diffusers/models/transformers/auraflow_transformer_2d.py +++ b/src/diffusers/models/transformers/auraflow_transformer_2d.py @@ -138,14 +138,14 @@ def __init__(self, dim, num_attention_heads, attention_head_dim): self.norm2 = FP32LayerNorm(dim, elementwise_affine=False, bias=False) self.ff = AuraFlowFeedForward(dim, dim * 4) - def forward(self, hidden_states: torch.FloatTensor, temb: torch.FloatTensor, i=9999): + def forward(self, hidden_states: torch.FloatTensor, temb: torch.FloatTensor): residual = 
hidden_states # Norm + Projection. norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb) # Attention. - attn_output = self.attn(hidden_states=norm_hidden_states, i=i) + attn_output = self.attn(hidden_states=norm_hidden_states) # Process attention outputs for the `hidden_states`. hidden_states = self.norm2(residual + gate_msa.unsqueeze(1) * attn_output) @@ -201,7 +201,7 @@ def __init__(self, dim, num_attention_heads, attention_head_dim): self.ff_context = AuraFlowFeedForward(dim, dim * 4) def forward( - self, hidden_states: torch.FloatTensor, encoder_hidden_states: torch.FloatTensor, temb: torch.FloatTensor, i=0 + self, hidden_states: torch.FloatTensor, encoder_hidden_states: torch.FloatTensor, temb: torch.FloatTensor ): residual = hidden_states residual_context = encoder_hidden_states @@ -214,7 +214,7 @@ def forward( # Attention. attn_output, context_attn_output = self.attn( - hidden_states=norm_hidden_states, encoder_hidden_states=norm_encoder_hidden_states, i=i + hidden_states=norm_hidden_states, encoder_hidden_states=norm_encoder_hidden_states ) # Process attention outputs for the `hidden_states`. @@ -366,7 +366,7 @@ def custom_forward(*inputs): else: encoder_hidden_states, hidden_states = block( - hidden_states=hidden_states, encoder_hidden_states=encoder_hidden_states, temb=temb, i=index_block + hidden_states=hidden_states, encoder_hidden_states=encoder_hidden_states, temb=temb ) # Single DiT blocks that combine the `hidden_states` (image) and `encoder_hidden_states` (text) diff --git a/src/diffusers/models/transformers/hunyuan_transformer_2d.py b/src/diffusers/models/transformers/hunyuan_transformer_2d.py index cc0dcbd79e9f..7f3dab220aaa 100644 --- a/src/diffusers/models/transformers/hunyuan_transformer_2d.py +++ b/src/diffusers/models/transformers/hunyuan_transformer_2d.py @@ -20,7 +20,7 @@ from ...utils import logging from ...utils.torch_utils import maybe_allow_in_graph from ..attention import FeedForward -from ..attention_processor import Attention, AttentionProcessor, HunyuanAttnProcessor2_0 +from ..attention_processor import Attention, AttentionProcessor, FusedHunyuanAttnProcessor2_0, HunyuanAttnProcessor2_0 from ..embeddings import ( HunyuanCombinedTimestepTextSizeStyleEmbedding, PatchEmbed, @@ -317,7 +317,7 @@ def __init__( self.norm_out = AdaLayerNormContinuous(self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6) self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True) - # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections with FusedAttnProcessor2_0->FusedHunyuanAttnProcessor2_0 def fuse_qkv_projections(self): """ Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value) @@ -341,6 +341,8 @@ def fuse_qkv_projections(self): if isinstance(module, Attention): module.fuse_projections(fuse=True) + self.set_attn_processor(FusedHunyuanAttnProcessor2_0()) + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections def unfuse_qkv_projections(self): """Disables the fused QKV projection if enabled. 
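Editor's note: each `fuse_qkv_projections` hunk in this patch (HunyuanDiT above, SD3 and the UNets below) follows the same pattern: after fusing the projection matrices, the matching fused attention processor is installed so the fused projections are actually used at runtime. A hedged usage sketch; the repo id and `subfolder` are assumptions, any HunyuanDiT checkpoint in diffusers format should behave the same:

```python
import torch
from diffusers import HunyuanDiT2DModel

transformer = HunyuanDiT2DModel.from_pretrained(
    "Tencent-Hunyuan/HunyuanDiT-Diffusers",  # illustrative repo id
    subfolder="transformer",
    torch_dtype=torch.float16,
)

transformer.fuse_qkv_projections()    # fuses the QKV projections and now also sets FusedHunyuanAttnProcessor2_0
# ... run inference with the fused projections ...
transformer.unfuse_qkv_projections()  # restores the original, unfused projections
```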
diff --git a/src/diffusers/models/transformers/transformer_sd3.py b/src/diffusers/models/transformers/transformer_sd3.py index 1b9126b3b849..9376c91d0756 100644 --- a/src/diffusers/models/transformers/transformer_sd3.py +++ b/src/diffusers/models/transformers/transformer_sd3.py @@ -13,8 +13,6 @@ # limitations under the License. -import inspect -from functools import partial from typing import Any, Dict, List, Optional, Union import torch @@ -23,7 +21,7 @@ from ...configuration_utils import ConfigMixin, register_to_config from ...loaders import FromOriginalModelMixin, PeftAdapterMixin from ...models.attention import JointTransformerBlock -from ...models.attention_processor import Attention, AttentionProcessor +from ...models.attention_processor import Attention, AttentionProcessor, FusedJointAttnProcessor2_0 from ...models.modeling_utils import ModelMixin from ...models.normalization import AdaLayerNormContinuous from ...utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers @@ -139,6 +137,18 @@ def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int for module in self.children(): fn_recursive_feed_forward(module, chunk_size, dim) + # Copied from diffusers.models.unets.unet_3d_condition.UNet3DConditionModel.disable_forward_chunking + def disable_forward_chunking(self): + def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int): + if hasattr(module, "set_chunk_feed_forward"): + module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim) + + for child in module.children(): + fn_recursive_feed_forward(child, chunk_size, dim) + + for module in self.children(): + fn_recursive_feed_forward(module, None, 0) + @property # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors def attn_processors(self) -> Dict[str, AttentionProcessor]: @@ -199,7 +209,7 @@ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): for name, module in self.named_children(): fn_recursive_attn_processor(name, module, processor) - # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections with FusedAttnProcessor2_0->FusedJointAttnProcessor2_0 def fuse_qkv_projections(self): """ Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value) @@ -223,6 +233,8 @@ def fuse_qkv_projections(self): if isinstance(module, Attention): module.fuse_projections(fuse=True) + self.set_attn_processor(FusedJointAttnProcessor2_0()) + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections def unfuse_qkv_projections(self): """Disables the fused QKV projection if enabled. 
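Editor's note: `SD3Transformer2DModel` gains a `disable_forward_chunking` counterpart (copied from `UNet3DConditionModel`) to the feed-forward chunking helper shown in the context above. A hedged sketch of the toggle; the checkpoint id is an assumption (the repo is gated), and any SD3 checkpoint in diffusers format works:

```python
import torch
from diffusers import SD3Transformer2DModel

transformer = SD3Transformer2DModel.from_pretrained(
    "stabilityai/stable-diffusion-3-medium-diffusers",  # illustrative, gated repo
    subfolder="transformer",
    torch_dtype=torch.float16,
)

# Chunk the feed-forward layers to trade speed for lower peak memory ...
transformer.enable_forward_chunking(chunk_size=1, dim=0)
# ... and switch back to the default, unchunked feed-forward afterwards.
transformer.disable_forward_chunking()
```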
@@ -241,47 +253,6 @@ def _set_gradient_checkpointing(self, module, value=False): if hasattr(module, "gradient_checkpointing"): module.gradient_checkpointing = value - def fuse_lora(self, lora_scale=1.0, safe_fusing=False, adapter_names=None): - if not USE_PEFT_BACKEND: - raise ValueError("PEFT backend is required for `fuse_lora()`.") - - self.lora_scale = lora_scale - self._safe_fusing = safe_fusing - self.apply(partial(self._fuse_lora_apply, adapter_names=adapter_names)) - - def _fuse_lora_apply(self, module, adapter_names=None): - from peft.tuners.tuners_utils import BaseTunerLayer - - merge_kwargs = {"safe_merge": self._safe_fusing} - - if isinstance(module, BaseTunerLayer): - if self.lora_scale != 1.0: - module.scale_layer(self.lora_scale) - - # For BC with prevous PEFT versions, we need to check the signature - # of the `merge` method to see if it supports the `adapter_names` argument. - supported_merge_kwargs = list(inspect.signature(module.merge).parameters) - if "adapter_names" in supported_merge_kwargs: - merge_kwargs["adapter_names"] = adapter_names - elif "adapter_names" not in supported_merge_kwargs and adapter_names is not None: - raise ValueError( - "The `adapter_names` argument is not supported with your PEFT version. Please upgrade" - " to the latest version of PEFT. `pip install -U peft`" - ) - - module.merge(**merge_kwargs) - - def unfuse_lora(self): - if not USE_PEFT_BACKEND: - raise ValueError("PEFT backend is required for `unfuse_lora()`.") - self.apply(self._unfuse_lora_apply) - - def _unfuse_lora_apply(self, module): - from peft.tuners.tuners_utils import BaseTunerLayer - - if isinstance(module, BaseTunerLayer): - module.unmerge() - def forward( self, hidden_states: torch.FloatTensor, diff --git a/src/diffusers/models/unets/unet_2d_condition.py b/src/diffusers/models/unets/unet_2d_condition.py index 2b9122799bf3..9a168bd22c93 100644 --- a/src/diffusers/models/unets/unet_2d_condition.py +++ b/src/diffusers/models/unets/unet_2d_condition.py @@ -30,6 +30,7 @@ AttentionProcessor, AttnAddedKVProcessor, AttnProcessor, + FusedAttnProcessor2_0, ) from ..embeddings import ( GaussianFourierProjection, @@ -890,6 +891,8 @@ def fuse_qkv_projections(self): if isinstance(module, Attention): module.fuse_projections(fuse=True) + self.set_attn_processor(FusedAttnProcessor2_0()) + def unfuse_qkv_projections(self): """Disables the fused QKV projection if enabled. 
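Editor's note: the `_fuse_lora_apply` helper removed above gated the `adapter_names` argument on whether the installed PEFT version's `merge()` accepts it. A small standalone sketch of that compatibility check (the function name `build_merge_kwargs` is illustrative, not part of the patch):

```python
import inspect
from typing import Callable, Dict, List, Optional


def build_merge_kwargs(
    merge_fn: Callable, adapter_names: Optional[List[str]] = None, safe_merge: bool = False
) -> Dict:
    """Only forward `adapter_names` if the given `merge` implementation supports it."""
    merge_kwargs = {"safe_merge": safe_merge}
    supported_merge_kwargs = inspect.signature(merge_fn).parameters
    if "adapter_names" in supported_merge_kwargs:
        merge_kwargs["adapter_names"] = adapter_names
    elif adapter_names is not None:
        raise ValueError(
            "The `adapter_names` argument is not supported with your PEFT version. Please upgrade"
            " to the latest version of PEFT. `pip install -U peft`"
        )
    return merge_kwargs
```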
@@ -1024,6 +1027,10 @@ def process_encoder_hidden_states( raise ValueError( f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`" ) + + if hasattr(self, "text_encoder_hid_proj") and self.text_encoder_hid_proj is not None: + encoder_hidden_states = self.text_encoder_hid_proj(encoder_hidden_states) + image_embeds = added_cond_kwargs.get("image_embeds") image_embeds = self.encoder_hid_proj(image_embeds) encoder_hidden_states = (encoder_hidden_states, image_embeds) diff --git a/src/diffusers/models/unets/unet_3d_blocks.py b/src/diffusers/models/unets/unet_3d_blocks.py index 0c3e3abe2087..51c743a14d40 100644 --- a/src/diffusers/models/unets/unet_3d_blocks.py +++ b/src/diffusers/models/unets/unet_3d_blocks.py @@ -966,6 +966,7 @@ def __init__( temporal_num_attention_heads: Union[int, Tuple[int]] = 1, temporal_cross_attention_dim: Optional[int] = None, temporal_max_seq_length: int = 32, + temporal_double_self_attention: bool = True, temporal_transformer_layers_per_block: Union[int, Tuple[int]] = 1, ): super().__init__() @@ -1016,6 +1017,7 @@ def __init__( positional_embeddings="sinusoidal", num_positional_embeddings=temporal_max_seq_length, attention_head_dim=out_channels // temporal_num_attention_heads[i], + double_self_attention=temporal_double_self_attention, ) ) @@ -1118,6 +1120,7 @@ def __init__( temporal_num_attention_heads: int = 8, temporal_max_seq_length: int = 32, temporal_transformer_layers_per_block: Union[int, Tuple[int]] = 1, + temporal_double_self_attention: bool = True, ): super().__init__() resnets = [] @@ -1199,6 +1202,7 @@ def __init__( positional_embeddings="sinusoidal", num_positional_embeddings=temporal_max_seq_length, attention_head_dim=out_channels // temporal_num_attention_heads, + double_self_attention=temporal_double_self_attention, ) ) @@ -1532,7 +1536,6 @@ def __init__( resnet_pre_norm: bool = True, output_scale_factor: float = 1.0, add_upsample: bool = True, - temporal_norm_num_groups: int = 32, temporal_cross_attention_dim: Optional[int] = None, temporal_num_attention_heads: int = 8, temporal_max_seq_length: int = 32, @@ -1574,7 +1577,7 @@ def __init__( num_attention_heads=temporal_num_attention_heads, in_channels=out_channels, num_layers=temporal_transformer_layers_per_block[i], - norm_num_groups=temporal_norm_num_groups, + norm_num_groups=resnet_groups, cross_attention_dim=temporal_cross_attention_dim, attention_bias=False, activation_fn="geglu", diff --git a/src/diffusers/models/unets/unet_3d_condition.py b/src/diffusers/models/unets/unet_3d_condition.py index 40b3b92427ce..3081fdc4700c 100644 --- a/src/diffusers/models/unets/unet_3d_condition.py +++ b/src/diffusers/models/unets/unet_3d_condition.py @@ -31,6 +31,7 @@ AttentionProcessor, AttnAddedKVProcessor, AttnProcessor, + FusedAttnProcessor2_0, ) from ..embeddings import TimestepEmbedding, Timesteps from ..modeling_utils import ModelMixin @@ -532,6 +533,8 @@ def fuse_qkv_projections(self): if isinstance(module, Attention): module.fuse_projections(fuse=True) + self.set_attn_processor(FusedAttnProcessor2_0()) + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections def unfuse_qkv_projections(self): """Disables the fused QKV projection if enabled. 
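Editor's note: the `process_encoder_hidden_states` change above touches the IP-Adapter path (`encoder_hid_dim_type="ip_image_proj"`), where image embeddings must reach the UNet through `added_cond_kwargs`. At the user level the pipelines fill that in internally; a hedged sketch of the call pattern that exercises it (checkpoint and VAE ids are borrowed from the example docstring later in this patch, the reference image is a placeholder, and a CUDA device is assumed):

```python
import torch
from PIL import Image
from diffusers import AutoencoderKL, StableDiffusionPipeline

vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16)
pipe = StableDiffusionPipeline.from_pretrained(
    "SG161222/Realistic_Vision_V5.1_noVAE",  # illustrative SD 1.5-style checkpoint
    vae=vae,
    torch_dtype=torch.float16,
).to("cuda")
pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")

ip_image = Image.new("RGB", (512, 512), color=(127, 127, 127))  # placeholder reference image
result = pipe(
    prompt="a photo of a person in a park",
    ip_adapter_image=ip_image,  # projected to `image_embeds` and routed through `added_cond_kwargs` internally
    num_inference_steps=25,
).images[0]
```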
diff --git a/src/diffusers/models/unets/unet_i2vgen_xl.py b/src/diffusers/models/unets/unet_i2vgen_xl.py index b650f0e21af0..6ab3a577b892 100644 --- a/src/diffusers/models/unets/unet_i2vgen_xl.py +++ b/src/diffusers/models/unets/unet_i2vgen_xl.py @@ -29,6 +29,7 @@ AttentionProcessor, AttnAddedKVProcessor, AttnProcessor, + FusedAttnProcessor2_0, ) from ..embeddings import TimestepEmbedding, Timesteps from ..modeling_utils import ModelMixin @@ -498,6 +499,8 @@ def fuse_qkv_projections(self): if isinstance(module, Attention): module.fuse_projections(fuse=True) + self.set_attn_processor(FusedAttnProcessor2_0()) + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections def unfuse_qkv_projections(self): """Disables the fused QKV projection if enabled. diff --git a/src/diffusers/models/unets/unet_motion_model.py b/src/diffusers/models/unets/unet_motion_model.py index 7e5209afba0c..196f947d599b 100644 --- a/src/diffusers/models/unets/unet_motion_model.py +++ b/src/diffusers/models/unets/unet_motion_model.py @@ -29,6 +29,7 @@ AttnAddedKVProcessor, AttnProcessor, AttnProcessor2_0, + FusedAttnProcessor2_0, IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0, ) @@ -929,6 +930,8 @@ def fuse_qkv_projections(self): if isinstance(module, Attention): module.fuse_projections(fuse=True) + self.set_attn_processor(FusedAttnProcessor2_0()) + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections def unfuse_qkv_projections(self): """Disables the fused QKV projection if enabled. diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 1d5fd5c2d094..7bc50b297566 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -119,6 +119,7 @@ _import_structure["animatediff"] = [ "AnimateDiffPipeline", "AnimateDiffSDXLPipeline", + "AnimateDiffSparseControlNetPipeline", "AnimateDiffVideoToVideoPipeline", ] _import_structure["audioldm"] = ["AudioLDMPipeline"] @@ -413,7 +414,12 @@ from ..utils.dummy_torch_and_transformers_objects import * else: from .amused import AmusedImg2ImgPipeline, AmusedInpaintPipeline, AmusedPipeline - from .animatediff import AnimateDiffPipeline, AnimateDiffSDXLPipeline, AnimateDiffVideoToVideoPipeline + from .animatediff import ( + AnimateDiffPipeline, + AnimateDiffSDXLPipeline, + AnimateDiffSparseControlNetPipeline, + AnimateDiffVideoToVideoPipeline, + ) from .audioldm import AudioLDMPipeline from .audioldm2 import ( AudioLDM2Pipeline, diff --git a/src/diffusers/pipelines/animatediff/__init__.py b/src/diffusers/pipelines/animatediff/__init__.py index ae6b67b1924c..5f4d6f29391c 100644 --- a/src/diffusers/pipelines/animatediff/__init__.py +++ b/src/diffusers/pipelines/animatediff/__init__.py @@ -23,6 +23,7 @@ else: _import_structure["pipeline_animatediff"] = ["AnimateDiffPipeline"] _import_structure["pipeline_animatediff_sdxl"] = ["AnimateDiffSDXLPipeline"] + _import_structure["pipeline_animatediff_sparsectrl"] = ["AnimateDiffSparseControlNetPipeline"] _import_structure["pipeline_animatediff_video2video"] = ["AnimateDiffVideoToVideoPipeline"] if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: @@ -35,6 +36,7 @@ else: from .pipeline_animatediff import AnimateDiffPipeline from .pipeline_animatediff_sdxl import AnimateDiffSDXLPipeline + from .pipeline_animatediff_sparsectrl import AnimateDiffSparseControlNetPipeline from .pipeline_animatediff_video2video import AnimateDiffVideoToVideoPipeline from .pipeline_output import AnimateDiffPipelineOutput diff --git 
a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py index e09ef00b5cb7..68d30f14f160 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py @@ -19,7 +19,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection from ...image_processor import PipelineImageInput -from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel from ...models.lora import adjust_lora_scale_text_encoder from ...models.unets.unet_motion_model import MotionAdapter @@ -70,7 +70,7 @@ class AnimateDiffPipeline( StableDiffusionMixin, TextualInversionLoaderMixin, IPAdapterMixin, - LoraLoaderMixin, + StableDiffusionLoraLoaderMixin, FreeInitMixin, ): r""" @@ -81,8 +81,8 @@ class AnimateDiffPipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters Args: @@ -184,7 +184,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -317,7 +317,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) if self.text_encoder is not None: - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py new file mode 100644 index 000000000000..e9e0d518c806 --- /dev/null +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py @@ -0,0 +1,1008 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import inspect +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import numpy as np +import PIL +import torch +import torch.nn.functional as F +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection + +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel +from ...models.controlnet_sparsectrl import SparseControlNetModel +from ...models.lora import adjust_lora_scale_text_encoder +from ...models.unets.unet_motion_model import MotionAdapter +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import ( + USE_PEFT_BACKEND, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from ...utils.torch_utils import is_compiled_module, randn_tensor +from ...video_processor import VideoProcessor +from ..free_init_utils import FreeInitMixin +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from .pipeline_output import AnimateDiffPipelineOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```python + >>> import torch + >>> from diffusers import AnimateDiffSparseControlNetPipeline + >>> from diffusers.models import AutoencoderKL, MotionAdapter, SparseControlNetModel + >>> from diffusers.schedulers import DPMSolverMultistepScheduler + >>> from diffusers.utils import export_to_gif, load_image + + >>> model_id = "SG161222/Realistic_Vision_V5.1_noVAE" + >>> motion_adapter_id = "guoyww/animatediff-motion-adapter-v1-5-3" + >>> controlnet_id = "guoyww/animatediff-sparsectrl-scribble" + >>> lora_adapter_id = "guoyww/animatediff-motion-lora-v1-5-3" + >>> vae_id = "stabilityai/sd-vae-ft-mse" + >>> device = "cuda" + + >>> motion_adapter = MotionAdapter.from_pretrained(motion_adapter_id, torch_dtype=torch.float16).to(device) + >>> controlnet = SparseControlNetModel.from_pretrained(controlnet_id, torch_dtype=torch.float16).to(device) + >>> vae = AutoencoderKL.from_pretrained(vae_id, torch_dtype=torch.float16).to(device) + >>> scheduler = DPMSolverMultistepScheduler.from_pretrained( + ... model_id, + ... subfolder="scheduler", + ... beta_schedule="linear", + ... algorithm_type="dpmsolver++", + ... use_karras_sigmas=True, + ... ) + >>> pipe = AnimateDiffSparseControlNetPipeline.from_pretrained( + ... model_id, + ... motion_adapter=motion_adapter, + ... controlnet=controlnet, + ... vae=vae, + ... scheduler=scheduler, + ... torch_dtype=torch.float16, + ... ).to(device) + >>> pipe.load_lora_weights(lora_adapter_id, adapter_name="motion_lora") + >>> pipe.fuse_lora(lora_scale=1.0) + + >>> prompt = "an aerial view of a cyberpunk city, night time, neon lights, masterpiece, high quality" + >>> negative_prompt = "low quality, worst quality, letterboxed" + + >>> image_files = [ + ... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-scribble-1.png", + ... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-scribble-2.png", + ... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-scribble-3.png", + ... ] + >>> condition_frame_indices = [0, 8, 15] + >>> conditioning_frames = [load_image(img_file) for img_file in image_files] + + >>> video = pipe( + ... 
prompt=prompt, + ... negative_prompt=negative_prompt, + ... num_inference_steps=25, + ... conditioning_frames=conditioning_frames, + ... controlnet_conditioning_scale=1.0, + ... controlnet_frame_indices=condition_frame_indices, + ... generator=torch.Generator().manual_seed(1337), + ... ).frames[0] + >>> export_to_gif(video, "output.gif") + ``` +""" + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents +def retrieve_latents( + encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + +class AnimateDiffSparseControlNetPipeline( + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + IPAdapterMixin, + StableDiffusionLoraLoaderMixin, + FreeInitMixin, +): + r""" + Pipeline for controlled text-to-video generation using the method described in [SparseCtrl: Adding Sparse Controls + to Text-to-Video Diffusion Models](https://arxiv.org/abs/2311.16933). + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer (`CLIPTokenizer`): + A [`~transformers.CLIPTokenizer`] to tokenize text. + unet ([`UNet2DConditionModel`]): + A [`UNet2DConditionModel`] used to create a UNetMotionModel to denoise the encoded video latents. + motion_adapter ([`MotionAdapter`]): + A [`MotionAdapter`] to be used in combination with `unet` to denoise the encoded video latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. 
+ """ + + model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae" + _optional_components = ["feature_extractor", "image_encoder", "motion_adapter"] + _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: Union[UNet2DConditionModel, UNetMotionModel], + motion_adapter: MotionAdapter, + controlnet: SparseControlNetModel, + scheduler: KarrasDiffusionSchedulers, + feature_extractor: CLIPImageProcessor = None, + image_encoder: CLIPVisionModelWithProjection = None, + ): + super().__init__() + if isinstance(unet, UNet2DConditionModel): + unet = UNetMotionModel.from_unet2d(unet, motion_adapter) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + motion_adapter=motion_adapter, + controlnet=controlnet, + scheduler=scheduler, + feature_extractor=feature_extractor, + image_encoder=image_encoder, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.video_processor = VideoProcessor(do_resize=False, vae_scale_factor=self.vae_scale_factor) + self.control_image_processor = VaeImageProcessor( + vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False + ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with num_images_per_prompt -> num_videos_per_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. 
+ """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if self.text_encoder is not None: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, 
output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance + ): + image_embeds = [] + if do_classifier_free_guidance: + negative_image_embeds = [] + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] + + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." + ) + + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + + image_embeds.append(single_image_embeds[None, :]) + if do_classifier_free_guidance: + negative_image_embeds.append(single_negative_image_embeds[None, :]) + else: + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + negative_image_embeds.append(single_negative_image_embeds) + image_embeds.append(single_image_embeds) + + ip_adapter_image_embeds = [] + for i, single_image_embeds in enumerate(image_embeds): + single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0) + if do_classifier_free_guidance: + single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0) + + single_image_embeds = single_image_embeds.to(device=device) + ip_adapter_image_embeds.append(single_image_embeds) + + return ip_adapter_image_embeds + + # Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents + def decode_latents(self, latents): + latents = 1 / self.vae.config.scaling_factor * latents + + batch_size, channels, num_frames, height, width = latents.shape + latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width) + + image = self.vae.decode(latents).sample + video = image[None, :].reshape((batch_size, num_frames, -1) + image.shape[2:]).permute(0, 2, 1, 3, 4) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + video = video.float() + return video + + # Copied from 
diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + height, + width, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, + callback_on_step_end_tensor_inputs=None, + image=None, + controlnet_conditioning_scale: float = 1.0, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." 
+ ) + + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + + is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance( + self.controlnet, torch._dynamo.eval_frame.OptimizedModule + ) + + # check `image` + if ( + isinstance(self.controlnet, SparseControlNetModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, SparseControlNetModel) + ): + if isinstance(image, list): + for image_ in image: + self.check_image(image_, prompt, prompt_embeds) + else: + self.check_image(image, prompt, prompt_embeds) + else: + assert False + + # Check `controlnet_conditioning_scale` + if ( + isinstance(self.controlnet, SparseControlNetModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, SparseControlNetModel) + ): + if not isinstance(controlnet_conditioning_scale, float): + raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.") + else: + assert False + + # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image + def check_image(self, image, prompt, prompt_embeds): + image_is_pil = isinstance(image, PIL.Image.Image) + image_is_tensor = isinstance(image, torch.Tensor) + image_is_np = isinstance(image, np.ndarray) + image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image) + image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor) + image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray) + + if ( + not image_is_pil + and not image_is_tensor + and not image_is_np + and not image_is_pil_list + and not image_is_tensor_list + and not image_is_np_list + ): + raise TypeError( + f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}" + ) + + if image_is_pil: + image_batch_size = 1 + else: + image_batch_size = len(image) + + if prompt is not None and isinstance(prompt, str): + prompt_batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + prompt_batch_size = len(prompt) + elif prompt_embeds is not None: + prompt_batch_size = prompt_embeds.shape[0] + + if image_batch_size != 1 and image_batch_size != prompt_batch_size: + raise ValueError( + f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}" + ) + + # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.prepare_latents + def prepare_latents( + self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None + ): + shape = ( + batch_size, + num_channels_latents, + num_frames, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + ) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + def prepare_image(self, image, width, height, device, dtype): + image = self.control_image_processor.preprocess(image, height=height, width=width) + controlnet_images = image.unsqueeze(0).to(device, dtype) + batch_size, num_frames, channels, height, width = controlnet_images.shape + + # TODO: remove below line + assert controlnet_images.min() >= 0 and controlnet_images.max() <= 1 + + if self.controlnet.use_simplified_condition_embedding: + controlnet_images = controlnet_images.reshape(batch_size * num_frames, channels, height, width) + controlnet_images = 2 * controlnet_images - 1 + conditioning_frames = retrieve_latents(self.vae.encode(controlnet_images)) * self.vae.config.scaling_factor + conditioning_frames = conditioning_frames.reshape( + batch_size, num_frames, 4, height // self.vae_scale_factor, width // self.vae_scale_factor + ) + else: + conditioning_frames = controlnet_images + + conditioning_frames = conditioning_frames.permute(0, 2, 1, 3, 4) # [b, c, f, h, w] + return conditioning_frames + + def prepare_sparse_control_conditioning( + self, + conditioning_frames: torch.Tensor, + num_frames: int, + controlnet_frame_indices: int, + device: torch.device, + dtype: torch.dtype, + ) -> Tuple[torch.Tensor, torch.Tensor]: + assert conditioning_frames.shape[2] >= len(controlnet_frame_indices) + + batch_size, channels, _, height, width = conditioning_frames.shape + controlnet_cond = torch.zeros((batch_size, channels, num_frames, height, width), dtype=dtype, device=device) + controlnet_cond_mask = torch.zeros((batch_size, 1, num_frames, height, width), dtype=dtype, device=device) + controlnet_cond[:, :, controlnet_frame_indices] = conditioning_frames[:, :, : len(controlnet_frame_indices)] + controlnet_cond_mask[:, :, controlnet_frame_indices] = 1 + + return controlnet_cond, controlnet_cond_mask + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def clip_skip(self): + return self._clip_skip + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. 
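To make the guidance-weight comment above concrete, a minimal sketch of how that weight is applied to the stacked noise predictions later in this pipeline's denoising loop (assuming, as there, that the unconditional batch comes first):

```py
# Minimal sketch of classifier-free guidance as used in the denoising loop below.
# `noise_pred` stacks the unconditional batch first and the text-conditioned batch second;
# `guidance_scale` corresponds to the weight `w` referenced in the comment above.
import torch

def apply_cfg(noise_pred: torch.Tensor, guidance_scale: float) -> torch.Tensor:
    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
    # guidance_scale == 1 reduces to the text-conditioned prediction alone (no guidance effect)
    return noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
```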
+ @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def num_timesteps(self): + return self._num_timesteps + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Optional[Union[str, List[str]]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_frames: int = 16, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_videos_per_prompt: int = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, + conditioning_frames: Optional[List[PipelineImageInput]] = None, + output_type: str = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + controlnet_conditioning_scale: Union[float, List[float]] = 1.0, + controlnet_frame_indices: List[int] = [0], + guess_mode: bool = False, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated video. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated video. + num_frames (`int`, *optional*, defaults to 16): + The number of video frames that are generated. Defaults to 16 frames which at 8 frames per seconds + amounts to 2 seconds of video. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality videos at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.Tensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video + generation. Can be used to tweak the same generation with different prompts. 
If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. Latents should be of shape + `(batch_size, num_channel, num_frames, height, width)`. + prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): + Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. + conditioning_frames (`List[PipelineImageInput]`, *optional*): + The SparseControlNet input to provide guidance to the `unet` for generation. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead + of a plain tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0): + The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added + to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set + the corresponding scale as a list. + controlnet_frame_indices (`List[int]`): + The indices where the conditioning frames must be applied for generation. Multiple frames can be + provided to guide the model to generate similar structure outputs, where the `unet` can + "fill-in-the-gaps" for interpolation videos, or a single frame could be provided for general expected + structure. Must have the same length as `conditioning_frames`. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. 
You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + + Examples: + + Returns: + [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] is + returned, otherwise a `tuple` is returned where the first element is a list with the generated frames. + """ + controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet + + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + num_videos_per_prompt = 1 + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt=prompt, + height=height, + width=width, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + ip_adapter_image=ip_adapter_image, + ip_adapter_image_embeds=ip_adapter_image_embeds, + callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs, + image=conditioning_frames, + controlnet_conditioning_scale=controlnet_conditioning_scale, + ) + + self._guidance_scale = guidance_scale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + global_pool_conditions = ( + controlnet.config.global_pool_conditions + if isinstance(controlnet, SparseControlNetModel) + else controlnet.nets[0].config.global_pool_conditions + ) + guess_mode = guess_mode or global_pool_conditions + + # 3. Encode input prompt + text_encoder_lora_scale = ( + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None + ) + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_videos_per_prompt, + self.do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=self.clip_skip, + ) + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + # 4. Prepare IP-Adapter embeddings + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_videos_per_prompt, + self.do_classifier_free_guidance, + ) + + # 5. Prepare controlnet conditioning + conditioning_frames = self.prepare_image(conditioning_frames, width, height, device, controlnet.dtype) + controlnet_cond, controlnet_cond_mask = self.prepare_sparse_control_conditioning( + conditioning_frames, num_frames, controlnet_frame_indices, device, controlnet.dtype + ) + + # 6. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 7. 
Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_videos_per_prompt, + num_channels_latents, + num_frames, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 9. Add image embeds for IP-Adapter + added_cond_kwargs = ( + {"image_embeds": image_embeds} + if ip_adapter_image is not None or ip_adapter_image_embeds is not None + else None + ) + + num_free_init_iters = self._free_init_num_iters if self.free_init_enabled else 1 + for free_init_iter in range(num_free_init_iters): + if self.free_init_enabled: + latents, timesteps = self._apply_free_init( + latents, free_init_iter, num_inference_steps, device, latents.dtype, generator + ) + + self._num_timesteps = len(timesteps) + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + + # 10. Denoising loop + with self.progress_bar(total=self._num_timesteps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + if guess_mode and self.do_classifier_free_guidance: + # Infer SparseControlNetModel only for the conditional batch. + control_model_input = latents + control_model_input = self.scheduler.scale_model_input(control_model_input, t) + controlnet_prompt_embeds = prompt_embeds.chunk(2)[1] + else: + control_model_input = latent_model_input + controlnet_prompt_embeds = prompt_embeds + + down_block_res_samples, mid_block_res_sample = self.controlnet( + control_model_input, + t, + encoder_hidden_states=controlnet_prompt_embeds, + controlnet_cond=controlnet_cond, + conditioning_mask=controlnet_cond_mask, + conditioning_scale=controlnet_conditioning_scale, + guess_mode=guess_mode, + return_dict=False, + ) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, + down_block_additional_residuals=down_block_res_samples, + mid_block_additional_residual=mid_block_res_sample, + ).sample + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + + # 11. 
Post processing + if output_type == "latent": + video = latents + else: + video_tensor = self.decode_latents(latents) + video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type) + + # 12. Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (video,) + + return AnimateDiffPipelineOutput(frames=video) diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py index b1501e12f551..69ba779de479 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py @@ -19,7 +19,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection from ...image_processor import PipelineImageInput -from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel from ...models.lora import adjust_lora_scale_text_encoder from ...models.unets.unet_motion_model import MotionAdapter @@ -174,7 +174,7 @@ class AnimateDiffVideoToVideoPipeline( StableDiffusionMixin, TextualInversionLoaderMixin, IPAdapterMixin, - LoraLoaderMixin, + StableDiffusionLoraLoaderMixin, FreeInitMixin, ): r""" @@ -185,8 +185,8 @@ class AnimateDiffVideoToVideoPipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters Args: @@ -288,7 +288,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -421,7 +421,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) if self.text_encoder is not None: - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py index 49440830d0c0..b45771d7de74 100644 --- a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py +++ b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py @@ -286,6 +286,7 @@ def generate_language_model( The sequence of generated hidden-states. 
""" max_new_tokens = max_new_tokens if max_new_tokens is not None else self.language_model.config.max_new_tokens + model_kwargs = self.language_model._get_initial_cache_position(inputs_embeds, model_kwargs) for _ in range(max_new_tokens): # prepare model inputs model_inputs = prepare_inputs_for_generation(inputs_embeds, **model_kwargs) diff --git a/src/diffusers/pipelines/aura_flow/pipeline_aura_flow.py b/src/diffusers/pipelines/aura_flow/pipeline_aura_flow.py index 3111ae075ccc..511d29215597 100644 --- a/src/diffusers/pipelines/aura_flow/pipeline_aura_flow.py +++ b/src/diffusers/pipelines/aura_flow/pipeline_aura_flow.py @@ -269,7 +269,6 @@ def encode_prompt( padding="max_length", return_tensors="pt", ) - text_inputs = {k: v.to(device) for k, v in text_inputs.items()} text_input_ids = text_inputs["input_ids"] untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids @@ -282,6 +281,7 @@ def encode_prompt( f" {max_length} tokens: {removed_text}" ) + text_inputs = {k: v.to(device) for k, v in text_inputs.items()} prompt_embeds = self.text_encoder(**text_inputs)[0] prompt_attention_mask = text_inputs["attention_mask"].unsqueeze(-1).expand(prompt_embeds.shape) prompt_embeds = prompt_embeds * prompt_attention_mask @@ -400,8 +400,8 @@ def __call__( sigmas: List[float] = None, guidance_scale: float = 3.5, num_images_per_prompt: Optional[int] = 1, - height: Optional[int] = 512, - width: Optional[int] = 512, + height: Optional[int] = 1024, + width: Optional[int] = 1024, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, latents: Optional[torch.Tensor] = None, prompt_embeds: Optional[torch.Tensor] = None, @@ -428,9 +428,9 @@ def __call__( `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). height (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. This is set to 512 by default. + The height in pixels of the generated image. This is set to 1024 by default for best results. width (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. This is set to 512 by default. + The width in pixels of the generated image. This is set to 1024 by default for best results. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. diff --git a/src/diffusers/pipelines/auto_pipeline.py b/src/diffusers/pipelines/auto_pipeline.py index be714a657c10..2df09f62c880 100644 --- a/src/diffusers/pipelines/auto_pipeline.py +++ b/src/diffusers/pipelines/auto_pipeline.py @@ -263,9 +263,7 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs): cache_dir (`Union[str, os.PathLike]`, *optional*): Path to a directory where a downloaded pretrained model configuration is cached if the standard cache is not used. - resume_download: - Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 - of Diffusers. + proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. 
@@ -340,7 +338,6 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs): """ cache_dir = kwargs.pop("cache_dir", None) force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", None) proxies = kwargs.pop("proxies", None) token = kwargs.pop("token", None) local_files_only = kwargs.pop("local_files_only", False) @@ -349,7 +346,6 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs): load_config_kwargs = { "cache_dir": cache_dir, "force_download": force_download, - "resume_download": resume_download, "proxies": proxies, "token": token, "local_files_only": local_files_only, @@ -554,9 +550,7 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs): cache_dir (`Union[str, os.PathLike]`, *optional*): Path to a directory where a downloaded pretrained model configuration is cached if the standard cache is not used. - resume_download: - Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 - of Diffusers. + proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. @@ -631,7 +625,6 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs): """ cache_dir = kwargs.pop("cache_dir", None) force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", None) proxies = kwargs.pop("proxies", None) token = kwargs.pop("token", None) local_files_only = kwargs.pop("local_files_only", False) @@ -640,7 +633,6 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs): load_config_kwargs = { "cache_dir": cache_dir, "force_download": force_download, - "resume_download": resume_download, "proxies": proxies, "token": token, "local_files_only": local_files_only, @@ -850,9 +842,7 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs): cache_dir (`Union[str, os.PathLike]`, *optional*): Path to a directory where a downloaded pretrained model configuration is cached if the standard cache is not used. - resume_download: - Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 - of Diffusers. + proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. 
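The same simplification is applied to each Auto class: only the keyword arguments that are still honoured get popped and forwarded. A hypothetical condensed sketch of that pattern (the helper name is invented for illustration; the pipelines inline this logic):

```py
# Hypothetical helper illustrating the kwargs handling after this change:
# `resume_download` is no longer popped or forwarded to the config loader.
def _collect_load_config_kwargs(kwargs: dict) -> dict:
    return {
        "cache_dir": kwargs.pop("cache_dir", None),
        "force_download": kwargs.pop("force_download", False),
        "proxies": kwargs.pop("proxies", None),
        "token": kwargs.pop("token", None),
        "local_files_only": kwargs.pop("local_files_only", False),
    }
```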
@@ -927,7 +917,6 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs): """ cache_dir = kwargs.pop("cache_dir", None) force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", None) proxies = kwargs.pop("proxies", None) token = kwargs.pop("token", None) local_files_only = kwargs.pop("local_files_only", False) @@ -936,7 +925,6 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs): load_config_kwargs = { "cache_dir": cache_dir, "force_download": force_download, - "resume_download": resume_download, "proxies": proxies, "token": token, "local_files_only": local_files_only, diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py index 7e03e8e1f083..0bde37244f5a 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py @@ -24,7 +24,7 @@ from ...callbacks import MultiPipelineCallbacks, PipelineCallback from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers @@ -156,7 +156,7 @@ class StableDiffusionControlNetPipeline( DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, - LoraLoaderMixin, + StableDiffusionLoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin, ): @@ -168,8 +168,8 @@ class StableDiffusionControlNetPipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters @@ -331,7 +331,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -464,7 +464,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) if self.text_encoder is not None: - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py index 477a1d0d17c0..f4c9881b4acf 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +++ 
b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py @@ -23,7 +23,7 @@ from ...callbacks import MultiPipelineCallbacks, PipelineCallback from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers @@ -134,7 +134,7 @@ class StableDiffusionControlNetImg2ImgPipeline( DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, - LoraLoaderMixin, + StableDiffusionLoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin, ): @@ -146,8 +146,8 @@ class StableDiffusionControlNetImg2ImgPipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters @@ -309,7 +309,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -442,7 +442,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) if self.text_encoder is not None: - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) @@ -824,6 +824,13 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt ) elif isinstance(generator, list): + if image.shape[0] < batch_size and batch_size % image.shape[0] == 0: + image = torch.cat([image] * (batch_size // image.shape[0]), dim=0) + elif image.shape[0] < batch_size and batch_size % image.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {image.shape[0]} to effective batch_size {batch_size} " + ) + init_latents = [ retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) for i in range(batch_size) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index 5a30f5cc6237..aa46f4e9b617 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -25,7 +25,7 @@ from ...callbacks import MultiPipelineCallbacks, PipelineCallback from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, 
TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers @@ -122,7 +122,7 @@ class StableDiffusionControlNetInpaintPipeline( DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, - LoraLoaderMixin, + StableDiffusionLoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin, ): @@ -134,8 +134,8 @@ class StableDiffusionControlNetInpaintPipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters @@ -311,7 +311,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -444,7 +444,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) if self.text_encoder is not None: - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py index b5c8589d9b5c..8acd290c2671 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py @@ -930,6 +930,13 @@ def prepare_latents( ) elif isinstance(generator, list): + if image.shape[0] < batch_size and batch_size % image.shape[0] == 0: + image = torch.cat([image] * (batch_size // image.shape[0]), dim=0) + elif image.shape[0] < batch_size and batch_size % image.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {image.shape[0]} to effective batch_size {batch_size} " + ) + init_latents = [ retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) for i in range(batch_size) diff --git a/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py b/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py index 5957e57b492a..4cb3cf783be2 100644 --- a/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +++ b/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py @@ -30,9 +30,12 @@ from ...models.transformers import SD3Transformer2DModel from ...schedulers import FlowMatchEulerDiscreteScheduler from ...utils import ( + USE_PEFT_BACKEND, 
is_torch_xla_available, logging, replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, ) from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline @@ -346,6 +349,7 @@ def encode_prompt( negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, clip_skip: Optional[int] = None, max_sequence_length: int = 256, + lora_scale: Optional[float] = None, ): r""" @@ -391,9 +395,22 @@ def encode_prompt( clip_skip (`int`, *optional*): Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that the output of the pre-final layer will be used for computing the prompt embeddings. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ device = device or self._execution_device + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, SD3LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if self.text_encoder is not None and USE_PEFT_BACKEND: + scale_lora_layers(self.text_encoder, lora_scale) + if self.text_encoder_2 is not None and USE_PEFT_BACKEND: + scale_lora_layers(self.text_encoder_2, lora_scale) + prompt = [prompt] if isinstance(prompt, str) else prompt if prompt is not None: batch_size = len(prompt) @@ -496,6 +513,16 @@ def encode_prompt( [negative_pooled_prompt_embed, negative_pooled_prompt_2_embed], dim=-1 ) + if self.text_encoder is not None: + if isinstance(self, SD3LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + if self.text_encoder_2 is not None: + if isinstance(self, SD3LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder_2, lora_scale) + return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds def check_inputs( diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py index b01e09b38141..6aa6bd44bdf4 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py @@ -23,7 +23,7 @@ from ...callbacks import MultiPipelineCallbacks, PipelineCallback from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ControlNetXSAdapter, UNet2DConditionModel, UNetControlNetXSModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers @@ -90,7 +90,11 @@ class StableDiffusionControlNetXSPipeline( - DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + StableDiffusionLoraLoaderMixin, + FromSingleFileMixin, ): r""" Pipeline for text-to-image generation using Stable Diffusion with ControlNet-XS guidance. 
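The SD3 ControlNet hunk above wraps text encoding in the same scale/unscale pattern used by the other LoRA-aware pipelines. A minimal sketch of that pattern, assuming the PEFT backend is available (the wrapper function is an illustration, not the pipeline's actual method, which inlines this logic):

```py
# Sketch of the LoRA scaling pattern added to `encode_prompt` above: LoRA layers on the
# text encoder are temporarily scaled before encoding and restored afterwards.
from diffusers.utils import USE_PEFT_BACKEND, scale_lora_layers, unscale_lora_layers

def encode_with_lora_scale(text_encoder, encode_fn, lora_scale=None):
    use_scaling = lora_scale is not None and text_encoder is not None and USE_PEFT_BACKEND
    if use_scaling:
        scale_lora_layers(text_encoder, lora_scale)
    try:
        return encode_fn()
    finally:
        if use_scaling:
            # retrieve the original scale by scaling back the LoRA layers
            unscale_lora_layers(text_encoder, lora_scale)
```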
@@ -100,8 +104,8 @@ class StableDiffusionControlNetXSPipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files Args: @@ -258,7 +262,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -391,7 +395,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) if self.text_encoder is not None: - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py index 1d438bcf877d..f545b24bec5c 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py @@ -7,7 +7,7 @@ import torch from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer -from ...loaders import LoraLoaderMixin +from ...loaders import StableDiffusionLoraLoaderMixin from ...models import UNet2DConditionModel from ...schedulers import DDPMScheduler from ...utils import ( @@ -84,7 +84,7 @@ """ -class IFPipeline(DiffusionPipeline, LoraLoaderMixin): +class IFPipeline(DiffusionPipeline, StableDiffusionLoraLoaderMixin): tokenizer: T5Tokenizer text_encoder: T5EncoderModel diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py index c5d9eed3ca9c..07017912575d 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py @@ -9,7 +9,7 @@ import torch from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer -from ...loaders import LoraLoaderMixin +from ...loaders import StableDiffusionLoraLoaderMixin from ...models import UNet2DConditionModel from ...schedulers import DDPMScheduler from ...utils import ( @@ -108,7 +108,7 @@ def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: """ -class IFImg2ImgPipeline(DiffusionPipeline, LoraLoaderMixin): +class IFImg2ImgPipeline(DiffusionPipeline, StableDiffusionLoraLoaderMixin): tokenizer: T5Tokenizer text_encoder: T5EncoderModel diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py index cb7e9ef6f347..6685ba6d774a 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py @@ -10,7 +10,7 @@ import torch.nn.functional as F from transformers import 
CLIPImageProcessor, T5EncoderModel, T5Tokenizer -from ...loaders import LoraLoaderMixin +from ...loaders import StableDiffusionLoraLoaderMixin from ...models import UNet2DConditionModel from ...schedulers import DDPMScheduler from ...utils import ( @@ -111,7 +111,7 @@ def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: """ -class IFImg2ImgSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin): +class IFImg2ImgSuperResolutionPipeline(DiffusionPipeline, StableDiffusionLoraLoaderMixin): tokenizer: T5Tokenizer text_encoder: T5EncoderModel diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py index cb592aa5675e..7fca0bc0443c 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py @@ -9,7 +9,7 @@ import torch from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer -from ...loaders import LoraLoaderMixin +from ...loaders import StableDiffusionLoraLoaderMixin from ...models import UNet2DConditionModel from ...schedulers import DDPMScheduler from ...utils import ( @@ -111,7 +111,7 @@ def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: """ -class IFInpaintingPipeline(DiffusionPipeline, LoraLoaderMixin): +class IFInpaintingPipeline(DiffusionPipeline, StableDiffusionLoraLoaderMixin): tokenizer: T5Tokenizer text_encoder: T5EncoderModel diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py index aa70eb7b40fe..4f04a1de2a6e 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py @@ -10,7 +10,7 @@ import torch.nn.functional as F from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer -from ...loaders import LoraLoaderMixin +from ...loaders import StableDiffusionLoraLoaderMixin from ...models import UNet2DConditionModel from ...schedulers import DDPMScheduler from ...utils import ( @@ -113,7 +113,7 @@ def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: """ -class IFInpaintingSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin): +class IFInpaintingSuperResolutionPipeline(DiffusionPipeline, StableDiffusionLoraLoaderMixin): tokenizer: T5Tokenizer text_encoder: T5EncoderModel diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py index fd38a8724308..891963f2a904 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py @@ -10,7 +10,7 @@ import torch.nn.functional as F from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer -from ...loaders import LoraLoaderMixin +from ...loaders import StableDiffusionLoraLoaderMixin from ...models import UNet2DConditionModel from ...schedulers import DDPMScheduler from ...utils import ( @@ -69,7 +69,7 @@ """ -class IFSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin): +class IFSuperResolutionPipeline(DiffusionPipeline, StableDiffusionLoraLoaderMixin): tokenizer: T5Tokenizer text_encoder: T5EncoderModel diff --git a/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py 
b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py index 6982568d042d..0d4ca3799a53 100644 --- a/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +++ b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py @@ -21,7 +21,12 @@ from ....configuration_utils import FrozenDict from ....image_processor import PipelineImageInput, VaeImageProcessor -from ....loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ....loaders import ( + FromSingleFileMixin, + IPAdapterMixin, + StableDiffusionLoraLoaderMixin, + TextualInversionLoaderMixin, +) from ....models import AutoencoderKL, ImageProjection, UNet2DConditionModel from ....models.lora import adjust_lora_scale_text_encoder from ....schedulers import KarrasDiffusionSchedulers @@ -137,7 +142,7 @@ class AltDiffusionPipeline( DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, - LoraLoaderMixin, + StableDiffusionLoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin, ): @@ -149,8 +154,8 @@ class AltDiffusionPipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters @@ -346,7 +351,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -478,7 +483,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py index f5de72a0f268..54823937f58e 100644 --- a/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -23,7 +23,12 @@ from ....configuration_utils import FrozenDict from ....image_processor import PipelineImageInput, VaeImageProcessor -from ....loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ....loaders import ( + FromSingleFileMixin, + IPAdapterMixin, + StableDiffusionLoraLoaderMixin, + TextualInversionLoaderMixin, +) from ....models import AutoencoderKL, ImageProjection, UNet2DConditionModel from ....models.lora import adjust_lora_scale_text_encoder from ....schedulers import KarrasDiffusionSchedulers @@ -178,7 
+183,7 @@ class AltDiffusionImg2ImgPipeline( StableDiffusionMixin, TextualInversionLoaderMixin, IPAdapterMixin, - LoraLoaderMixin, + StableDiffusionLoraLoaderMixin, FromSingleFileMixin, ): r""" @@ -189,8 +194,8 @@ class AltDiffusionImg2ImgPipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters @@ -386,7 +391,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -518,7 +523,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py index 4977b183b59a..777be883cb9d 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py @@ -23,7 +23,7 @@ from ....configuration_utils import FrozenDict from ....image_processor import PipelineImageInput, VaeImageProcessor -from ....loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ....loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ....models import AutoencoderKL, UNet2DConditionModel from ....models.lora import adjust_lora_scale_text_encoder from ....schedulers import DDIMScheduler @@ -136,7 +136,7 @@ def compute_noise(scheduler, prev_latents, latents, timestep, noise_pred, eta): return noise -class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): +class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin): r""" Pipeline for text-guided image to image generation using Stable Diffusion. 
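# Illustrative sketch (an assumption about backwards compatibility, not shown in this diff):
# a rename such as `LoraLoaderMixin` -> `StableDiffusionLoraLoaderMixin`, applied throughout
# the pipeline files above, is usually accompanied by keeping the old name as a thin,
# deprecated alias so that downstream imports keep working while emitting a warning.
from diffusers.loaders import StableDiffusionLoraLoaderMixin
from diffusers.utils import deprecate


class LoraLoaderMixin(StableDiffusionLoraLoaderMixin):
    def __init__(self, *args, **kwargs):
        deprecation_message = (
            "`LoraLoaderMixin` is deprecated; use `StableDiffusionLoraLoaderMixin` instead."
        )
        deprecate("LoraLoaderMixin", "1.0.0", deprecation_message)
        super().__init__(*args, **kwargs)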
@@ -145,8 +145,8 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights Args: vae ([`AutoencoderKL`]): @@ -324,7 +324,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -457,7 +457,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) if self.text_encoder is not None: - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py index c4e06039dccc..ce7ad3b0dfe9 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py @@ -23,7 +23,7 @@ from ....configuration_utils import FrozenDict from ....image_processor import VaeImageProcessor -from ....loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ....loaders import FromSingleFileMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ....models import AutoencoderKL, UNet2DConditionModel from ....models.lora import adjust_lora_scale_text_encoder from ....schedulers import KarrasDiffusionSchedulers @@ -79,7 +79,7 @@ def preprocess_mask(mask, batch_size, scale_factor=8): class StableDiffusionInpaintPipelineLegacy( - DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin + DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin, FromSingleFileMixin ): r""" Pipeline for text-guided image inpainting using Stable Diffusion. *This is an experimental feature*. 
@@ -89,11 +89,11 @@ class StableDiffusionInpaintPipelineLegacy( In addition the pipeline inherits the following loading methods: - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] - - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] + - *LoRA*: [`loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`] as well as the following saving methods: - - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] + - *LoRA*: [`loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] Args: vae ([`AutoencoderKL`]): @@ -294,7 +294,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -427,7 +427,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) if self.text_encoder is not None: - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py index a9b95d49dc23..701e7a3a81b2 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py @@ -19,7 +19,7 @@ from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer from ....image_processor import VaeImageProcessor -from ....loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ....loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ....models import AutoencoderKL, UNet2DConditionModel from ....models.lora import adjust_lora_scale_text_encoder from ....schedulers import PNDMScheduler @@ -37,7 +37,7 @@ class StableDiffusionModelEditingPipeline( - DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin + DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin ): r""" Pipeline for text-to-image model editing. 
@@ -47,8 +47,8 @@ class StableDiffusionModelEditingPipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights Args: vae ([`AutoencoderKL`]): @@ -232,7 +232,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -365,7 +365,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) if self.text_encoder is not None: - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py index 473598a53133..be21900ab55a 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py @@ -19,7 +19,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ....image_processor import VaeImageProcessor -from ....loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ....loaders import FromSingleFileMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ....models import AutoencoderKL, UNet2DConditionModel from ....models.lora import adjust_lora_scale_text_encoder from ....schedulers import KarrasDiffusionSchedulers @@ -63,7 +63,11 @@ class StableDiffusionParadigmsPipeline( - DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + StableDiffusionLoraLoaderMixin, + FromSingleFileMixin, ): r""" Pipeline for text-to-image generation using a parallelized version of Stable Diffusion. 
@@ -73,8 +77,8 @@ class StableDiffusionParadigmsPipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files Args: @@ -223,7 +227,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -356,7 +360,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) if self.text_encoder is not None: - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py index 738239fcd194..2978972200c7 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py @@ -29,7 +29,7 @@ ) from ....image_processor import PipelineImageInput, VaeImageProcessor -from ....loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ....loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ....models import AutoencoderKL, UNet2DConditionModel from ....models.attention_processor import Attention from ....models.lora import adjust_lora_scale_text_encoder @@ -446,7 +446,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -579,7 +579,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) if self.text_encoder is not None: - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/src/diffusers/pipelines/free_init_utils.py b/src/diffusers/pipelines/free_init_utils.py index 4f7965a038c5..1fb67592ca4f 100644 --- a/src/diffusers/pipelines/free_init_utils.py +++ b/src/diffusers/pipelines/free_init_utils.py @@ -180,6 +180,8 @@ def _apply_free_init( num_inference_steps = max( 1, int(num_inference_steps / self._free_init_num_iters * (free_init_iteration + 1)) ) + + if num_inference_steps > 0: 
self.scheduler.set_timesteps(num_inference_steps, device=device) return latents, self.scheduler.timesteps diff --git a/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py b/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py index 1219797a5c94..0ff63bad5782 100644 --- a/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +++ b/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py @@ -3,7 +3,7 @@ import torch from transformers import T5EncoderModel, T5Tokenizer -from ...loaders import LoraLoaderMixin +from ...loaders import StableDiffusionLoraLoaderMixin from ...models import Kandinsky3UNet, VQModel from ...schedulers import DDPMScheduler from ...utils import ( @@ -47,7 +47,7 @@ def downscale_height_and_width(height, width, scale_factor=8): return new_height * scale_factor, new_width * scale_factor -class Kandinsky3Pipeline(DiffusionPipeline, LoraLoaderMixin): +class Kandinsky3Pipeline(DiffusionPipeline, StableDiffusionLoraLoaderMixin): model_cpu_offload_seq = "text_encoder->unet->movq" _callback_tensor_inputs = [ "latents", diff --git a/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py b/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py index 457ffb8fd5be..39313f867221 100644 --- a/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +++ b/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py @@ -7,7 +7,7 @@ import torch from transformers import T5EncoderModel, T5Tokenizer -from ...loaders import LoraLoaderMixin +from ...loaders import StableDiffusionLoraLoaderMixin from ...models import Kandinsky3UNet, VQModel from ...schedulers import DDPMScheduler from ...utils import ( @@ -62,7 +62,7 @@ def prepare_image(pil_image): return image -class Kandinsky3Img2ImgPipeline(DiffusionPipeline, LoraLoaderMixin): +class Kandinsky3Img2ImgPipeline(DiffusionPipeline, StableDiffusionLoraLoaderMixin): model_cpu_offload_seq = "text_encoder->movq->unet->movq" _callback_tensor_inputs = [ "latents", diff --git a/src/diffusers/pipelines/kolors/pipeline_kolors.py b/src/diffusers/pipelines/kolors/pipeline_kolors.py index 97831a837f3e..6f25fd1ac0be 100644 --- a/src/diffusers/pipelines/kolors/pipeline_kolors.py +++ b/src/diffusers/pipelines/kolors/pipeline_kolors.py @@ -15,11 +15,12 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union import torch +from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection from ...callbacks import MultiPipelineCallbacks, PipelineCallback -from ...image_processor import VaeImageProcessor -from ...loaders import StableDiffusionXLLoraLoaderMixin -from ...models import AutoencoderKL, UNet2DConditionModel +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import IPAdapterMixin, StableDiffusionXLLoraLoaderMixin +from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel from ...models.attention_processor import AttnProcessor2_0, FusedAttnProcessor2_0, XFormersAttnProcessor from ...schedulers import KarrasDiffusionSchedulers from ...utils import is_torch_xla_available, logging, replace_example_docstring @@ -120,7 +121,7 @@ def retrieve_timesteps( return timesteps, num_inference_steps -class KolorsPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffusionXLLoraLoaderMixin): +class KolorsPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffusionXLLoraLoaderMixin, IPAdapterMixin): r""" Pipeline for text-to-image generation using Kolors. 
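# Worked example for the `free_init_utils.py` hunk above (standalone arithmetic only, not the
# pipelines' code): each FreeInit iteration re-runs denoising with a growing share of the
# requested steps, per the formula shown in the hunk, and the new `if num_inference_steps > 0`
# guard skips `scheduler.set_timesteps` when there are no steps to schedule.
num_inference_steps = 50
free_init_num_iters = 3
for free_init_iteration in range(free_init_num_iters):
    steps = max(1, int(num_inference_steps / free_init_num_iters * (free_init_iteration + 1)))
    print(free_init_iteration, steps)  # prints: 0 16, 1 33, 2 50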
@@ -130,6 +131,7 @@ class KolorsPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffusionXLL The pipeline also inherits the following loading methods: - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters Args: vae ([`AutoencoderKL`]): @@ -148,7 +150,11 @@ class KolorsPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffusionXLL `Kwai-Kolors/Kolors-diffusers`. """ - model_cpu_offload_seq = "text_encoder->unet->vae" + model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae" + _optional_components = [ + "image_encoder", + "feature_extractor", + ] _callback_tensor_inputs = [ "latents", "prompt_embeds", @@ -166,11 +172,21 @@ def __init__( tokenizer: ChatGLMTokenizer, unet: UNet2DConditionModel, scheduler: KarrasDiffusionSchedulers, + image_encoder: CLIPVisionModelWithProjection = None, + feature_extractor: CLIPImageProcessor = None, force_zeros_for_empty_prompt: bool = False, ): super().__init__() - self.register_modules(vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, unet=unet, scheduler=scheduler) + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + image_encoder=image_encoder, + feature_extractor=feature_extractor, + ) self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) self.vae_scale_factor = ( 2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8 @@ -343,6 +359,77 @@ def encode_prompt( return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance + ): + image_embeds = [] + if do_classifier_free_guidance: + negative_image_embeds = [] + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] + + if len(ip_adapter_image) != 
len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." + ) + + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + + image_embeds.append(single_image_embeds[None, :]) + if do_classifier_free_guidance: + negative_image_embeds.append(single_negative_image_embeds[None, :]) + else: + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + negative_image_embeds.append(single_negative_image_embeds) + image_embeds.append(single_image_embeds) + + ip_adapter_image_embeds = [] + for i, single_image_embeds in enumerate(image_embeds): + single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0) + if do_classifier_free_guidance: + single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0) + + single_image_embeds = single_image_embeds.to(device=device) + ip_adapter_image_embeds.append(single_image_embeds) + + return ip_adapter_image_embeds + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs def prepare_extra_step_kwargs(self, generator, eta): # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature @@ -364,6 +451,7 @@ def prepare_extra_step_kwargs(self, generator, eta): def check_inputs( self, prompt, + num_inference_steps, height, width, negative_prompt=None, @@ -371,9 +459,17 @@ def check_inputs( pooled_prompt_embeds=None, negative_prompt_embeds=None, negative_pooled_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, callback_on_step_end_tensor_inputs=None, max_sequence_length=None, ): + if not isinstance(num_inference_steps, int) or num_inference_steps <= 0: + raise ValueError( + f"`num_inference_steps` has to be a positive integer but is {num_inference_steps} of type" + f" {type(num_inference_steps)}." + ) + if height % 8 != 0 or width % 8 != 0: raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") @@ -420,6 +516,21 @@ def check_inputs( "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`." ) + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." 
+ ) + + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + if max_sequence_length is not None and max_sequence_length > 256: raise ValueError(f"`max_sequence_length` cannot be greater than 256 but is {max_sequence_length}") @@ -563,6 +674,8 @@ def __call__( pooled_prompt_embeds: Optional[torch.Tensor] = None, negative_prompt_embeds: Optional[torch.Tensor] = None, negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -649,6 +762,12 @@ def __call__( Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. @@ -719,6 +838,7 @@ def __call__( # 1. Check inputs. Raise error if not correct self.check_inputs( prompt, + num_inference_steps, height, width, negative_prompt, @@ -726,6 +846,8 @@ def __call__( pooled_prompt_embeds, negative_prompt_embeds, negative_pooled_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, callback_on_step_end_tensor_inputs, max_sequence_length=max_sequence_length, ) @@ -815,6 +937,15 @@ def __call__( add_text_embeds = add_text_embeds.to(device) add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1) + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, + ) + # 8. 
Denoising loop num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) @@ -856,6 +987,9 @@ def __call__( # predict the noise residual added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + added_cond_kwargs["image_embeds"] = image_embeds + noise_pred = self.unet( latent_model_input, t, diff --git a/src/diffusers/pipelines/kolors/pipeline_kolors_img2img.py b/src/diffusers/pipelines/kolors/pipeline_kolors_img2img.py index 7eb15a1e8eee..a3324d34ae33 100644 --- a/src/diffusers/pipelines/kolors/pipeline_kolors_img2img.py +++ b/src/diffusers/pipelines/kolors/pipeline_kolors_img2img.py @@ -16,11 +16,12 @@ import PIL.Image import torch +from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection from ...callbacks import MultiPipelineCallbacks, PipelineCallback from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import StableDiffusionXLLoraLoaderMixin -from ...models import AutoencoderKL, UNet2DConditionModel +from ...loaders import IPAdapterMixin, StableDiffusionXLLoraLoaderMixin +from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel from ...models.attention_processor import AttnProcessor2_0, FusedAttnProcessor2_0, XFormersAttnProcessor from ...schedulers import KarrasDiffusionSchedulers from ...utils import is_torch_xla_available, logging, replace_example_docstring @@ -139,7 +140,7 @@ def retrieve_timesteps( return timesteps, num_inference_steps -class KolorsImg2ImgPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffusionXLLoraLoaderMixin): +class KolorsImg2ImgPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffusionXLLoraLoaderMixin, IPAdapterMixin): r""" Pipeline for text-to-image generation using Kolors. @@ -149,6 +150,7 @@ class KolorsImg2ImgPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffu The pipeline also inherits the following loading methods: - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters Args: vae ([`AutoencoderKL`]): @@ -167,10 +169,10 @@ class KolorsImg2ImgPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffu `Kwai-Kolors/Kolors-diffusers`. 
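# Illustrative sketch of the new IP-Adapter path added to `KolorsPipeline` above. The
# IP-Adapter repo id, subfolder, weight file name and reference-image URL below are
# placeholders (assumptions, not taken from this diff); only `Kwai-Kolors/Kolors-diffusers`
# and the `ip_adapter_image` argument come from the code shown here.
import torch
from diffusers import KolorsPipeline
from diffusers.utils import load_image

pipe = KolorsPipeline.from_pretrained("Kwai-Kolors/Kolors-diffusers", torch_dtype=torch.float16)
pipe.to("cuda")

# `load_ip_adapter` comes from the newly inherited `IPAdapterMixin`; via its
# `image_encoder_folder` argument it can also load the CLIP image encoder that the
# pipeline now accepts as an optional component.
pipe.load_ip_adapter(
    "your-org/kolors-ip-adapter",           # placeholder repo id
    subfolder="ip_adapter",                 # placeholder subfolder
    weight_name="ip_adapter.safetensors",   # placeholder weight file
)

reference = load_image("https://example.com/reference.png")  # placeholder URL
image = pipe(
    prompt="a watercolor painting of a lighthouse",
    ip_adapter_image=reference,             # new argument introduced in this diff
    num_inference_steps=25,
).images[0]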
""" - model_cpu_offload_seq = "text_encoder->unet->vae" + model_cpu_offload_seq = "text_encoder->image_encoder-unet->vae" _optional_components = [ - "tokenizer", - "text_encoder", + "image_encoder", + "feature_extractor", ] _callback_tensor_inputs = [ "latents", @@ -189,11 +191,21 @@ def __init__( tokenizer: ChatGLMTokenizer, unet: UNet2DConditionModel, scheduler: KarrasDiffusionSchedulers, + image_encoder: CLIPVisionModelWithProjection = None, + feature_extractor: CLIPImageProcessor = None, force_zeros_for_empty_prompt: bool = False, ): super().__init__() - self.register_modules(vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, unet=unet, scheduler=scheduler) + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + image_encoder=image_encoder, + feature_extractor=feature_extractor, + ) self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) self.vae_scale_factor = ( 2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8 @@ -367,6 +379,77 @@ def encode_prompt( return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance + ): + image_embeds = [] + if do_classifier_free_guidance: + negative_image_embeds = [] + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] + + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." 
+ ) + + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + + image_embeds.append(single_image_embeds[None, :]) + if do_classifier_free_guidance: + negative_image_embeds.append(single_negative_image_embeds[None, :]) + else: + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + negative_image_embeds.append(single_negative_image_embeds) + image_embeds.append(single_image_embeds) + + ip_adapter_image_embeds = [] + for i, single_image_embeds in enumerate(image_embeds): + single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0) + if do_classifier_free_guidance: + single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0) + + single_image_embeds = single_image_embeds.to(device=device) + ip_adapter_image_embeds.append(single_image_embeds) + + return ip_adapter_image_embeds + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs def prepare_extra_step_kwargs(self, generator, eta): # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature @@ -389,6 +472,7 @@ def check_inputs( self, prompt, strength, + num_inference_steps, height, width, negative_prompt=None, @@ -396,12 +480,20 @@ def check_inputs( pooled_prompt_embeds=None, negative_prompt_embeds=None, negative_pooled_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, callback_on_step_end_tensor_inputs=None, max_sequence_length=None, ): if strength < 0 or strength > 1: raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") + if not isinstance(num_inference_steps, int) or num_inference_steps <= 0: + raise ValueError( + f"`num_inference_steps` has to be a positive integer but is {num_inference_steps} of type" + f" {type(num_inference_steps)}." + ) + if height % 8 != 0 or width % 8 != 0: raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") @@ -448,6 +540,21 @@ def check_inputs( "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`." ) + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." 
+ ) + + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + if max_sequence_length is not None and max_sequence_length > 256: raise ValueError(f"`max_sequence_length` cannot be greater than 256 but is {max_sequence_length}") @@ -528,6 +635,13 @@ def prepare_latents( ) elif isinstance(generator, list): + if image.shape[0] < batch_size and batch_size % image.shape[0] == 0: + image = torch.cat([image] * (batch_size // image.shape[0]), dim=0) + elif image.shape[0] < batch_size and batch_size % image.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {image.shape[0]} to effective batch_size {batch_size} " + ) + init_latents = [ retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) for i in range(batch_size) @@ -692,6 +806,8 @@ def __call__( pooled_prompt_embeds: Optional[torch.Tensor] = None, negative_prompt_embeds: Optional[torch.Tensor] = None, negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -794,6 +910,12 @@ def __call__( Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. @@ -865,6 +987,7 @@ def __call__( self.check_inputs( prompt, strength, + num_inference_steps, height, width, negative_prompt, @@ -872,6 +995,8 @@ def __call__( pooled_prompt_embeds, negative_prompt_embeds, negative_pooled_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, callback_on_step_end_tensor_inputs, max_sequence_length=max_sequence_length, ) @@ -983,6 +1108,15 @@ def denoising_value_valid(dnv): add_text_embeds = add_text_embeds.to(device) add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1) + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, + ) + # 9. 
Denoising loop num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) @@ -1030,6 +1164,9 @@ def denoising_value_valid(dnv): # predict the noise residual added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + added_cond_kwargs["image_embeds"] = image_embeds + noise_pred = self.unet( latent_model_input, t, diff --git a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py index 4c40d2fd9e5b..dd72d3c9e10e 100644 --- a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +++ b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py @@ -23,7 +23,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import LCMScheduler @@ -148,7 +148,7 @@ class LatentConsistencyModelImg2ImgPipeline( StableDiffusionMixin, TextualInversionLoaderMixin, IPAdapterMixin, - LoraLoaderMixin, + StableDiffusionLoraLoaderMixin, FromSingleFileMixin, ): r""" @@ -159,8 +159,8 @@ class LatentConsistencyModelImg2ImgPipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters @@ -273,7 +273,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -406,7 +406,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) if self.text_encoder is not None: - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) @@ -520,6 +520,13 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt ) elif isinstance(generator, list): + if image.shape[0] < batch_size and batch_size % image.shape[0] == 0: + image = torch.cat([image] * (batch_size // image.shape[0]), dim=0) + elif image.shape[0] < batch_size and batch_size % image.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` 
of batch size {image.shape[0]} to effective batch_size {batch_size} " + ) + init_latents = [ retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) for i in range(batch_size) diff --git a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py index 4141a1daf2b4..89cafc2877fe 100644 --- a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +++ b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py @@ -22,7 +22,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import LCMScheduler @@ -126,7 +126,7 @@ class LatentConsistencyModelPipeline( StableDiffusionMixin, TextualInversionLoaderMixin, IPAdapterMixin, - LoraLoaderMixin, + StableDiffusionLoraLoaderMixin, FromSingleFileMixin, ): r""" @@ -137,8 +137,8 @@ class LatentConsistencyModelPipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters @@ -257,7 +257,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -390,7 +390,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) if self.text_encoder is not None: - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py b/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py index d64c636e2f30..537b509e3720 100644 --- a/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +++ b/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py @@ -10,7 +10,7 @@ from ...configuration_utils import FrozenDict from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, 
IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import Attention, AttnProcessor from ...models.lora import adjust_lora_scale_text_encoder @@ -248,7 +248,7 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): class LEditsPPPipelineStableDiffusion( - DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin + DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin ): """ Pipeline for textual image editing using LEDits++ with Stable Diffusion. @@ -538,7 +538,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -676,7 +676,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py b/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py index 032cc9b23715..6dc21c9d4538 100644 --- a/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +++ b/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py @@ -24,7 +24,7 @@ from ...callbacks import MultiPipelineCallbacks, PipelineCallback from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers @@ -159,7 +159,7 @@ class StableDiffusionControlNetPAGPipeline( DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, - LoraLoaderMixin, + StableDiffusionLoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin, PAGMixin, @@ -172,8 +172,8 @@ class StableDiffusionControlNetPAGPipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters @@ -305,7 +305,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + 
if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -438,7 +438,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) if self.text_encoder is not None: - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/src/diffusers/pipelines/pag/pipeline_pag_sd.py b/src/diffusers/pipelines/pag/pipeline_pag_sd.py index bae0a599ab57..5bdbe8f82662 100644 --- a/src/diffusers/pipelines/pag/pipeline_pag_sd.py +++ b/src/diffusers/pipelines/pag/pipeline_pag_sd.py @@ -20,7 +20,7 @@ from ...configuration_utils import FrozenDict from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers @@ -137,7 +137,7 @@ class StableDiffusionPAGPipeline( DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, - LoraLoaderMixin, + StableDiffusionLoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin, PAGMixin, @@ -150,8 +150,8 @@ class StableDiffusionPAGPipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters @@ -319,7 +319,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -452,7 +452,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) if self.text_encoder is not None: - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/src/diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py b/src/diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py index 549202cfd6c5..7cb4a8d65a65 100644 --- a/src/diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +++ b/src/diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py @@ -719,6 +719,13 @@ def prepare_latents( ) elif isinstance(generator, list): + if image.shape[0] < batch_size and batch_size % image.shape[0] == 0: + image = torch.cat([image] * 
(batch_size // image.shape[0]), dim=0) + elif image.shape[0] < batch_size and batch_size % image.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {image.shape[0]} to effective batch_size {batch_size} " + ) + init_latents = [ retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) for i in range(batch_size) diff --git a/src/diffusers/pipelines/pia/pipeline_pia.py b/src/diffusers/pipelines/pia/pipeline_pia.py index d2baf861b271..f21d11ba918d 100644 --- a/src/diffusers/pipelines/pia/pipeline_pia.py +++ b/src/diffusers/pipelines/pia/pipeline_pia.py @@ -22,7 +22,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection from ...image_processor import PipelineImageInput -from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel from ...models.lora import adjust_lora_scale_text_encoder from ...models.unets.unet_motion_model import MotionAdapter @@ -128,7 +128,7 @@ class PIAPipeline( StableDiffusionMixin, TextualInversionLoaderMixin, IPAdapterMixin, - LoraLoaderMixin, + StableDiffusionLoraLoaderMixin, FromSingleFileMixin, FreeInitMixin, ): @@ -140,8 +140,8 @@ class PIAPipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters Args: @@ -243,7 +243,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -376,7 +376,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) if self.text_encoder is not None: - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/src/diffusers/pipelines/pipeline_flax_utils.py b/src/diffusers/pipelines/pipeline_flax_utils.py index 81fab6709125..c4c212873a88 100644 --- a/src/diffusers/pipelines/pipeline_flax_utils.py +++ b/src/diffusers/pipelines/pipeline_flax_utils.py @@ -254,9 +254,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. - resume_download: - Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 - of Diffusers. 
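# Illustrative sketch (not part of this diff): the hunks above swap the internal base
# class LoraLoaderMixin for StableDiffusionLoraLoaderMixin across the SD-family
# pipelines, but the public LoRA API on a pipeline instance is unchanged. Model and
# LoRA repo ids below are placeholders chosen for the example, not taken from this PR.
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
)

# load_lora_weights/unload_lora_weights are now provided by
# StableDiffusionLoraLoaderMixin, exactly as they previously came from LoraLoaderMixin.
pipe.load_lora_weights("some-user/some-sd15-lora")  # placeholder LoRA repo id
image = pipe(
    "a photo of an astronaut riding a horse",
    cross_attention_kwargs={"scale": 0.8},  # LoRA scale, forwarded to encode_prompt as lora_scale
).images[0]
pipe.unload_lora_weights()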
+ proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. @@ -316,7 +314,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P ``` """ cache_dir = kwargs.pop("cache_dir", None) - resume_download = kwargs.pop("resume_download", None) proxies = kwargs.pop("proxies", None) local_files_only = kwargs.pop("local_files_only", False) token = kwargs.pop("token", None) @@ -332,7 +329,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P config_dict = cls.load_config( pretrained_model_name_or_path, cache_dir=cache_dir, - resume_download=resume_download, proxies=proxies, local_files_only=local_files_only, token=token, @@ -363,7 +359,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P cached_folder = snapshot_download( pretrained_model_name_or_path, cache_dir=cache_dir, - resume_download=resume_download, proxies=proxies, local_files_only=local_files_only, token=token, diff --git a/src/diffusers/pipelines/pipeline_loading_utils.py b/src/diffusers/pipelines/pipeline_loading_utils.py index 0043bec65d79..a0af28803d79 100644 --- a/src/diffusers/pipelines/pipeline_loading_utils.py +++ b/src/diffusers/pipelines/pipeline_loading_utils.py @@ -435,7 +435,6 @@ def _load_empty_model( return_unused_kwargs=True, return_commit_hash=True, force_download=kwargs.pop("force_download", False), - resume_download=kwargs.pop("resume_download", None), proxies=kwargs.pop("proxies", None), local_files_only=kwargs.pop("local_files_only", False), token=kwargs.pop("token", None), @@ -454,7 +453,6 @@ def _load_empty_model( cached_folder, subfolder=name, force_download=kwargs.pop("force_download", False), - resume_download=kwargs.pop("resume_download", None), proxies=kwargs.pop("proxies", None), local_files_only=kwargs.pop("local_files_only", False), token=kwargs.pop("token", None), @@ -544,7 +542,6 @@ def _get_final_device_map(device_map, pipeline_class, passed_class_obj, init_dic torch_dtype=torch_dtype, cached_folder=kwargs.get("cached_folder", None), force_download=kwargs.get("force_download", None), - resume_download=kwargs.get("resume_download", None), proxies=kwargs.get("proxies", None), local_files_only=kwargs.get("local_files_only", None), token=kwargs.get("token", None), diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index e5f822caa0ef..2cc9defc3ffa 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -533,9 +533,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P cache_dir (`Union[str, os.PathLike]`, *optional*): Path to a directory where a downloaded pretrained model configuration is cached if the standard cache is not used. - resume_download: - Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 - of Diffusers. + proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. 
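# Illustrative sketch (not part of this diff): with the deprecated `resume_download`
# kwarg dropped from the flax and torch pipeline loaders above, callers simply omit it;
# interrupted downloads are resumed automatically by huggingface_hub. The repo id is a
# placeholder; the proxy address mirrors the docstring example above.
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",    # placeholder model repo
    proxies={"http": "foo.bar:3128"},    # still supported, per the docstring above
    force_download=False,                # still supported
)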
@@ -625,7 +623,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P ``` """ cache_dir = kwargs.pop("cache_dir", None) - resume_download = kwargs.pop("resume_download", None) force_download = kwargs.pop("force_download", False) proxies = kwargs.pop("proxies", None) local_files_only = kwargs.pop("local_files_only", None) @@ -702,7 +699,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P cached_folder = cls.download( pretrained_model_name_or_path, cache_dir=cache_dir, - resume_download=resume_download, force_download=force_download, proxies=proxies, local_files_only=local_files_only, @@ -842,7 +838,6 @@ def load_module(name, value): torch_dtype=torch_dtype, cached_folder=cached_folder, force_download=force_download, - resume_download=resume_download, proxies=proxies, local_files_only=local_files_only, token=token, @@ -910,7 +905,6 @@ def load_module(name, value): connected_pipes = {prefix: getattr(modelcard.data, prefix, [None])[0] for prefix in CONNECTED_PIPES_KEYS} load_kwargs = { "cache_dir": cache_dir, - "resume_download": resume_download, "force_download": force_download, "proxies": proxies, "local_files_only": local_files_only, @@ -1216,9 +1210,7 @@ def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]: force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. - resume_download: - Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 - of Diffusers. + proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. 
@@ -1271,7 +1263,6 @@ def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]: """ cache_dir = kwargs.pop("cache_dir", None) - resume_download = kwargs.pop("resume_download", None) force_download = kwargs.pop("force_download", False) proxies = kwargs.pop("proxies", None) local_files_only = kwargs.pop("local_files_only", None) @@ -1311,7 +1302,6 @@ def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]: revision=revision, proxies=proxies, force_download=force_download, - resume_download=resume_download, token=token, ) @@ -1500,7 +1490,6 @@ def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]: cached_folder = snapshot_download( pretrained_model_name, cache_dir=cache_dir, - resume_download=resume_download, proxies=proxies, local_files_only=local_files_only, token=token, @@ -1523,7 +1512,6 @@ def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]: for connected_pipe_repo_id in connected_pipes: download_kwargs = { "cache_dir": cache_dir, - "resume_download": resume_download, "force_download": force_download, "proxies": proxies, "local_files_only": local_files_only, diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py index b4b1b885dd3c..53dc98aea698 100644 --- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py @@ -1370,6 +1370,8 @@ def download_from_original_stable_diffusion_ckpt( if "unet_config" in original_config["model"]["params"]: original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels + elif "network_config" in original_config["model"]["params"]: + original_config["model"]["params"]["network_config"]["params"]["in_channels"] = num_in_channels if ( "parameterization" in original_config["model"]["params"] diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 4264d5b89562..9254db065ebb 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -25,7 +25,7 @@ from ...callbacks import MultiPipelineCallbacks, PipelineCallback from ...configuration_utils import FrozenDict from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers @@ -137,7 +137,7 @@ class StableDiffusionPipeline( DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, - LoraLoaderMixin, + StableDiffusionLoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin, ): @@ -149,8 +149,8 @@ class StableDiffusionPipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA 
weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters @@ -347,7 +347,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -480,7 +480,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) if self.text_encoder is not None: - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py index 780771279b8c..dddda6851e15 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py @@ -24,7 +24,7 @@ from ...configuration_utils import FrozenDict from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers @@ -74,7 +74,7 @@ def preprocess(image): return image -class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): +class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin): r""" Pipeline for text-guided depth-based image-to-image generation using Stable Diffusion. 
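# Illustrative sketch (not part of this diff): the convert_from_ckpt hunk a little
# above now applies the `num_in_channels` override to original configs that keep their
# UNet settings under "network_config" (typically SDXL-style yaml configs; this naming
# assumption is mine) as well as the SD1/SD2-style "unet_config". The helper below only
# restates that branch in isolation.
def apply_num_in_channels(original_config: dict, num_in_channels: int) -> None:
    params = original_config["model"]["params"]
    if "unet_config" in params:
        params["unet_config"]["params"]["in_channels"] = num_in_channels
    elif "network_config" in params:
        params["network_config"]["params"]["in_channels"] = num_in_channels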
@@ -83,8 +83,8 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoader The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights Args: vae ([`AutoencoderKL`]): @@ -225,7 +225,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -358,7 +358,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) if self.text_encoder is not None: - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) @@ -494,6 +494,13 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt ) elif isinstance(generator, list): + if image.shape[0] < batch_size and batch_size % image.shape[0] == 0: + image = torch.cat([image] * (batch_size // image.shape[0]), dim=0) + elif image.shape[0] < batch_size and batch_size % image.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {image.shape[0]} to effective batch_size {batch_size} " + ) + init_latents = [ retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) for i in range(batch_size) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 073e5f308ccf..0b368ba2ac13 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -24,7 +24,7 @@ from ...callbacks import MultiPipelineCallbacks, PipelineCallback from ...configuration_utils import FrozenDict from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers @@ -175,7 +175,7 @@ class StableDiffusionImg2ImgPipeline( StableDiffusionMixin, TextualInversionLoaderMixin, IPAdapterMixin, - LoraLoaderMixin, + StableDiffusionLoraLoaderMixin, FromSingleFileMixin, ): r""" @@ -186,8 +186,8 @@ class StableDiffusionImg2ImgPipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for 
saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters @@ -385,7 +385,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -518,7 +518,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) if self.text_encoder is not None: - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) @@ -740,6 +740,13 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt ) elif isinstance(generator, list): + if image.shape[0] < batch_size and batch_size % image.shape[0] == 0: + image = torch.cat([image] * (batch_size // image.shape[0]), dim=0) + elif image.shape[0] < batch_size and batch_size % image.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {image.shape[0]} to effective batch_size {batch_size} " + ) + init_latents = [ retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) for i in range(batch_size) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index cb035db4784e..21c6642b5e9f 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -23,7 +23,7 @@ from ...callbacks import MultiPipelineCallbacks, PipelineCallback from ...configuration_utils import FrozenDict from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AsymmetricAutoencoderKL, AutoencoderKL, ImageProjection, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers @@ -116,7 +116,7 @@ class StableDiffusionInpaintPipeline( StableDiffusionMixin, TextualInversionLoaderMixin, IPAdapterMixin, - LoraLoaderMixin, + StableDiffusionLoraLoaderMixin, FromSingleFileMixin, ): r""" @@ -127,8 +127,8 @@ class StableDiffusionInpaintPipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.IPAdapterMixin.load_ip_adapter`] for 
loading IP Adapters - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files @@ -335,7 +335,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -468,7 +468,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) if self.text_encoder is not None: - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py index 4b088556e349..95cc179b8896 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py @@ -22,7 +22,7 @@ from ...callbacks import MultiPipelineCallbacks, PipelineCallback from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import PIL_INTERPOLATION, deprecate, logging @@ -74,7 +74,11 @@ def retrieve_latents( class StableDiffusionInstructPix2PixPipeline( - DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + StableDiffusionLoraLoaderMixin, + IPAdapterMixin, ): r""" Pipeline for pixel-level image editing by following text instructions (based on Stable Diffusion). 
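# Illustrative sketch (not part of this diff): the prepare_latents hunks above
# (img2img, depth2img, PAG SDXL img2img) add the same guard on the list-of-generators
# path: a smaller image batch is tiled up to the effective batch size when it divides
# evenly, otherwise a ValueError is raised before the per-sample VAE encode loop.
import torch

def expand_image_batch(image: torch.Tensor, batch_size: int) -> torch.Tensor:
    if image.shape[0] < batch_size and batch_size % image.shape[0] == 0:
        image = torch.cat([image] * (batch_size // image.shape[0]), dim=0)
    elif image.shape[0] < batch_size and batch_size % image.shape[0] != 0:
        raise ValueError(
            f"Cannot duplicate `image` of batch size {image.shape[0]} to effective batch_size {batch_size} "
        )
    return image

# e.g. one preprocessed init image expanded for an effective batch size of 4
expanded = expand_image_batch(torch.randn(1, 3, 512, 512), batch_size=4)
assert expanded.shape[0] == 4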
@@ -84,8 +88,8 @@ class StableDiffusionInstructPix2PixPipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters Args: diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py index 4b6c2d6c23c2..4cbbe17531ef 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py @@ -22,7 +22,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import ( AttnProcessor2_0, @@ -66,7 +66,11 @@ def preprocess(image): class StableDiffusionUpscalePipeline( - DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + StableDiffusionLoraLoaderMixin, + FromSingleFileMixin, ): r""" Pipeline for text-guided image super-resolution using Stable Diffusion 2. 
@@ -76,8 +80,8 @@ class StableDiffusionUpscalePipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files Args: @@ -243,7 +247,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -376,7 +380,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) if self.text_encoder is not None: - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py index 2ec689795181..41811f8f2c0e 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py @@ -20,7 +20,7 @@ from transformers.models.clip.modeling_clip import CLIPTextModelOutput from ...image_processor import VaeImageProcessor -from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, PriorTransformer, UNet2DConditionModel from ...models.embeddings import get_timestep_embedding from ...models.lora import adjust_lora_scale_text_encoder @@ -58,7 +58,9 @@ """ -class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin): +class StableUnCLIPPipeline( + DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin +): """ Pipeline for text-to-image generation using stable unCLIP. 
@@ -67,8 +69,8 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights Args: prior_tokenizer ([`CLIPTokenizer`]): @@ -326,7 +328,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -459,7 +461,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) if self.text_encoder is not None: - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py index 377fc17f2bb3..2556d5e57b6d 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py @@ -20,7 +20,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection from ...image_processor import VaeImageProcessor -from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.embeddings import get_timestep_embedding from ...models.lora import adjust_lora_scale_text_encoder @@ -70,7 +70,7 @@ class StableUnCLIPImg2ImgPipeline( - DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin + DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin ): """ Pipeline for text-guided image-to-image generation using stable unCLIP. 
@@ -80,8 +80,8 @@ class StableUnCLIPImg2ImgPipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights Args: feature_extractor ([`CLIPImageProcessor`]): @@ -290,7 +290,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -423,7 +423,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) if self.text_encoder is not None: - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py index f8c075294b73..9856e9e94ecc 100644 --- a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +++ b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py @@ -29,9 +29,12 @@ from ...models.transformers import SD3Transformer2DModel from ...schedulers import FlowMatchEulerDiscreteScheduler from ...utils import ( + USE_PEFT_BACKEND, is_torch_xla_available, logging, replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, ) from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline @@ -329,6 +332,7 @@ def encode_prompt( negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, clip_skip: Optional[int] = None, max_sequence_length: int = 256, + lora_scale: Optional[float] = None, ): r""" @@ -374,9 +378,22 @@ def encode_prompt( clip_skip (`int`, *optional*): Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that the output of the pre-final layer will be used for computing the prompt embeddings. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
""" device = device or self._execution_device + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, SD3LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if self.text_encoder is not None and USE_PEFT_BACKEND: + scale_lora_layers(self.text_encoder, lora_scale) + if self.text_encoder_2 is not None and USE_PEFT_BACKEND: + scale_lora_layers(self.text_encoder_2, lora_scale) + prompt = [prompt] if isinstance(prompt, str) else prompt if prompt is not None: batch_size = len(prompt) @@ -479,6 +496,16 @@ def encode_prompt( [negative_pooled_prompt_embed, negative_pooled_prompt_2_embed], dim=-1 ) + if self.text_encoder is not None: + if isinstance(self, SD3LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + if self.text_encoder_2 is not None: + if isinstance(self, SD3LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder_2, lora_scale) + return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds def check_inputs( @@ -683,7 +710,7 @@ def __call__( Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed will be used. Must be in descending order. - guidance_scale (`float`, *optional*, defaults to 5.0): + guidance_scale (`float`, *optional*, defaults to 7.0): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of [Imagen Paper](https://arxiv.org/pdf/2205.11487.pdf). 
Guidance scale is enabled by setting `guidance_scale > @@ -787,6 +814,9 @@ def __call__( device = self._execution_device + lora_scale = ( + self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None + ) ( prompt_embeds, negative_prompt_embeds, @@ -808,6 +838,7 @@ def __call__( clip_skip=self.clip_skip, num_images_per_prompt=num_images_per_prompt, max_sequence_length=max_sequence_length, + lora_scale=lora_scale, ) if self.do_classifier_free_guidance: diff --git a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py index 1fa324cc912f..4fbb3c71cd5d 100644 --- a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py @@ -25,13 +25,17 @@ ) from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import SD3LoraLoaderMixin from ...models.autoencoders import AutoencoderKL from ...models.transformers import SD3Transformer2DModel from ...schedulers import FlowMatchEulerDiscreteScheduler from ...utils import ( + USE_PEFT_BACKEND, is_torch_xla_available, logging, replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, ) from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline @@ -346,6 +350,7 @@ def encode_prompt( negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, clip_skip: Optional[int] = None, max_sequence_length: int = 256, + lora_scale: Optional[float] = None, ): r""" @@ -391,9 +396,22 @@ def encode_prompt( clip_skip (`int`, *optional*): Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that the output of the pre-final layer will be used for computing the prompt embeddings. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ device = device or self._execution_device + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, SD3LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if self.text_encoder is not None and USE_PEFT_BACKEND: + scale_lora_layers(self.text_encoder, lora_scale) + if self.text_encoder_2 is not None and USE_PEFT_BACKEND: + scale_lora_layers(self.text_encoder_2, lora_scale) + prompt = [prompt] if isinstance(prompt, str) else prompt if prompt is not None: batch_size = len(prompt) @@ -496,6 +514,16 @@ def encode_prompt( [negative_pooled_prompt_embed, negative_pooled_prompt_2_embed], dim=-1 ) + if self.text_encoder is not None: + if isinstance(self, SD3LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + if self.text_encoder_2 is not None: + if isinstance(self, SD3LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder_2, lora_scale) + return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds def check_inputs( @@ -724,7 +752,7 @@ def __call__( Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument in their `set_timesteps` method. 
If not defined, the default behavior when `num_inference_steps` is passed will be used. Must be in descending order. - guidance_scale (`float`, *optional*, defaults to 5.0): + guidance_scale (`float`, *optional*, defaults to 7.0): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of [Imagen Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > diff --git a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py index f340223cb38b..d5dedae16581 100644 --- a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py @@ -25,13 +25,17 @@ from ...callbacks import MultiPipelineCallbacks, PipelineCallback from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import SD3LoraLoaderMixin from ...models.autoencoders import AutoencoderKL from ...models.transformers import SD3Transformer2DModel from ...schedulers import FlowMatchEulerDiscreteScheduler from ...utils import ( + USE_PEFT_BACKEND, is_torch_xla_available, logging, replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, ) from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline @@ -352,6 +356,7 @@ def encode_prompt( negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, clip_skip: Optional[int] = None, max_sequence_length: int = 256, + lora_scale: Optional[float] = None, ): r""" @@ -397,9 +402,22 @@ def encode_prompt( clip_skip (`int`, *optional*): Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that the output of the pre-final layer will be used for computing the prompt embeddings. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
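# Illustrative sketch (not part of this diff): with the SD3 hunks above, __call__ reads
# a LoRA scale from joint_attention_kwargs["scale"] and forwards it to encode_prompt as
# lora_scale, so it also scales text-encoder LoRA layers when the PEFT backend is
# available. Model and LoRA repo ids are placeholders for the example.
import torch
from diffusers import StableDiffusion3Pipeline

pipe = StableDiffusion3Pipeline.from_pretrained(
    "stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16
)
pipe.load_lora_weights("some-user/some-sd3-lora")  # placeholder LoRA repo id

image = pipe(
    "a watercolor painting of a fox",
    guidance_scale=7.0,                     # matches the corrected docstring default
    joint_attention_kwargs={"scale": 0.7},  # forwarded to encode_prompt as lora_scale
).images[0]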
""" device = device or self._execution_device + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, SD3LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if self.text_encoder is not None and USE_PEFT_BACKEND: + scale_lora_layers(self.text_encoder, lora_scale) + if self.text_encoder_2 is not None and USE_PEFT_BACKEND: + scale_lora_layers(self.text_encoder_2, lora_scale) + prompt = [prompt] if isinstance(prompt, str) else prompt if prompt is not None: batch_size = len(prompt) @@ -502,6 +520,16 @@ def encode_prompt( [negative_pooled_prompt_embed, negative_pooled_prompt_2_embed], dim=-1 ) + if self.text_encoder is not None: + if isinstance(self, SD3LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + if self.text_encoder_2 is not None: + if isinstance(self, SD3LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder_2, lora_scale) + return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_img2img.StableDiffusion3Img2ImgPipeline.check_inputs @@ -850,7 +878,7 @@ def __call__( Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed will be used. Must be in descending order. - guidance_scale (`float`, *optional*, defaults to 5.0): + guidance_scale (`float`, *optional*, defaults to 7.0): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of [Imagen Paper](https://arxiv.org/pdf/2205.11487.pdf). 
Guidance scale is enabled by setting `guidance_scale > diff --git a/src/diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py b/src/diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py index 65fd27bca28d..8f40fa72a25c 100644 --- a/src/diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +++ b/src/diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py @@ -22,7 +22,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...image_processor import VaeImageProcessor -from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import Attention from ...models.lora import adjust_lora_scale_text_encoder @@ -323,7 +323,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -456,7 +456,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) if self.text_encoder is not None: - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py b/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py index 9f1ad9ecb6fd..2b86470dbff1 100644 --- a/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +++ b/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py @@ -24,7 +24,7 @@ from ...configuration_utils import FrozenDict from ...image_processor import VaeImageProcessor -from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import DDIMInverseScheduler, KarrasDiffusionSchedulers @@ -234,7 +234,7 @@ def preprocess_mask(mask, batch_size: int = 1): class StableDiffusionDiffEditPipeline( - DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin + DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin ): r""" @@ -250,8 +250,8 @@ class StableDiffusionDiffEditPipeline( The pipeline also inherits the following loading and saving methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights Args: vae 
([`AutoencoderKL`]): @@ -448,7 +448,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -581,7 +581,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) if self.text_encoder is not None: - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py b/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py index 590532947720..62584beec6a9 100644 --- a/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +++ b/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py @@ -21,7 +21,7 @@ from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer from ...image_processor import VaeImageProcessor -from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention import GatedSelfAttentionDense from ...models.lora import adjust_lora_scale_text_encoder @@ -249,7 +249,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -382,7 +382,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) if self.text_encoder is not None: - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py b/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py index 209e90db0188..67b9b927f210 100644 --- a/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +++ b/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py @@ -27,7 +27,7 @@ ) from ...image_processor import VaeImageProcessor -from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention import GatedSelfAttentionDense from ...models.lora import adjust_lora_scale_text_encoder @@ -274,7 +274,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, 
StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -407,7 +407,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) if self.text_encoder is not None: - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py b/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py index 53e03888cd82..1e396cb2329f 100755 --- a/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py @@ -21,7 +21,7 @@ from k_diffusion.sampling import BrownianTreeNoiseSampler, get_sigmas_karras from ...image_processor import VaeImageProcessor -from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import LMSDiscreteScheduler from ...utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers @@ -48,7 +48,7 @@ def apply_model(self, *args, **kwargs): class StableDiffusionKDiffusionPipeline( - DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin + DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin ): r""" Pipeline for text-to-image generation using Stable Diffusion. 
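# Illustrative sketch (not part of this diff): every encode_prompt touched in this PR
# keeps the same bracketing around the text encoder when a LoRA scale is supplied and
# the PEFT backend is active: scale the LoRA layers, run the forward pass, then restore
# the original scale. The helper names come from the diff; the encoder call is simplified.
from diffusers.utils import USE_PEFT_BACKEND, scale_lora_layers, unscale_lora_layers

def encode_with_lora_scale(text_encoder, text_input_ids, lora_scale=None):
    if lora_scale is not None and USE_PEFT_BACKEND:
        scale_lora_layers(text_encoder, lora_scale)
    prompt_embeds = text_encoder(text_input_ids)[0]
    if lora_scale is not None and USE_PEFT_BACKEND:
        # retrieve the original scale by scaling back the LoRA layers
        unscale_lora_layers(text_encoder, lora_scale)
    return prompt_embeds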
@@ -58,8 +58,8 @@ class StableDiffusionKDiffusionPipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights @@ -223,7 +223,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -356,7 +356,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) if self.text_encoder is not None: - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py b/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py index 74a29c9b1f01..95532efc6d2c 100644 --- a/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +++ b/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py @@ -22,7 +22,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection from ...image_processor import PipelineImageInput, VaeImageProcessorLDM3D -from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers @@ -161,7 +161,7 @@ class StableDiffusionLDM3DPipeline( StableDiffusionMixin, TextualInversionLoaderMixin, IPAdapterMixin, - LoraLoaderMixin, + StableDiffusionLoraLoaderMixin, FromSingleFileMixin, ): r""" @@ -172,8 +172,8 @@ class StableDiffusionLDM3DPipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters @@ -323,7 +323,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = 
lora_scale # dynamically adjust the LoRA scale @@ -456,7 +456,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) if self.text_encoder is not None: - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py b/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py index 9aee4827f920..f6fabeeca011 100644 --- a/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +++ b/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py @@ -19,7 +19,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import DDIMScheduler @@ -135,7 +135,11 @@ def retrieve_timesteps( class StableDiffusionPanoramaPipeline( - DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + StableDiffusionLoraLoaderMixin, + IPAdapterMixin, ): r""" Pipeline for text-to-image generation using MultiDiffusion. 
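The `LoraLoaderMixin` -> `StableDiffusionLoraLoaderMixin` rename applied throughout the pipelines above is internal to the class hierarchy; the user-facing loader API is unchanged. A minimal sketch of the calls the updated docstrings refer to (the checkpoint path and LoRA file below are placeholders, not taken from this patch):

from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("path/to/sd15-checkpoint")  # placeholder model path
pipe.load_lora_weights("path/to/lora_weights.safetensors")  # now provided by StableDiffusionLoraLoaderMixin
image = pipe("a photo of an astronaut riding a horse", num_inference_steps=20).images[0]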
@@ -145,8 +149,8 @@ class StableDiffusionPanoramaPipeline( The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters Args: @@ -295,7 +299,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -428,7 +432,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) if self.text_encoder is not None: - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py b/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py index bee9df9d7130..c32052d2e4d0 100644 --- a/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +++ b/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py @@ -20,7 +20,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers @@ -238,7 +238,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -371,7 +371,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) if self.text_encoder is not None: - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index e968450076c5..8acc251f42d2 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -710,6 +710,13 @@ 
def prepare_latents( ) elif isinstance(generator, list): + if image.shape[0] < batch_size and batch_size % image.shape[0] == 0: + image = torch.cat([image] * (batch_size // image.shape[0]), dim=0) + elif image.shape[0] < batch_size and batch_size % image.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {image.shape[0]} to effective batch_size {batch_size} " + ) + init_latents = [ retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) for i in range(batch_size) diff --git a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py index 4895d0484db8..55a8694c16e9 100644 --- a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py @@ -22,7 +22,7 @@ from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer from ...image_processor import VaeImageProcessor -from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, MultiAdapter, T2IAdapter, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers @@ -340,7 +340,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -473,7 +473,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) if self.text_encoder is not None: - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py index 3c086702178b..cdd72b97f86b 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py @@ -18,7 +18,7 @@ import torch from transformers import CLIPTextModel, CLIPTokenizer -from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet3DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers @@ -58,7 +58,9 @@ """ -class TextToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin): +class TextToVideoSDPipeline( + DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin +): r""" Pipeline for text-to-video generation. 
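The batch handling added to `prepare_latents` in the SDXL img2img pipeline earlier in this patch follows a simple rule: an image batch smaller than the effective batch size is tiled up when the sizes divide evenly, and rejected otherwise. A standalone sketch of that rule (illustrative only, not imported from diffusers):

import torch

def duplicate_image_batch(image: torch.Tensor, batch_size: int) -> torch.Tensor:
    # Tile the image batch up to `batch_size` when it divides evenly; otherwise fail loudly.
    if image.shape[0] < batch_size:
        if batch_size % image.shape[0] != 0:
            raise ValueError(
                f"Cannot duplicate `image` of batch size {image.shape[0]} to effective batch_size {batch_size}"
            )
        image = torch.cat([image] * (batch_size // image.shape[0]), dim=0)
    return image

print(duplicate_image_batch(torch.randn(1, 3, 64, 64), 4).shape)  # torch.Size([4, 3, 64, 64])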
@@ -67,8 +69,8 @@ class TextToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInve The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights Args: vae ([`AutoencoderKL`]): @@ -183,7 +185,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -316,7 +318,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) if self.text_encoder is not None: - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py index 2b27c7fcabf8..92bf1d388c13 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py @@ -19,7 +19,7 @@ import torch from transformers import CLIPTextModel, CLIPTokenizer -from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet3DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers @@ -93,7 +93,9 @@ def retrieve_latents( raise AttributeError("Could not access latents of provided encoder_output") -class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin): +class VideoToVideoSDPipeline( + DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin +): r""" Pipeline for text-guided video-to-video generation. 
@@ -102,8 +104,8 @@ class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInv The pipeline also inherits the following loading methods: - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights Args: vae ([`AutoencoderKL`]): @@ -218,7 +220,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -351,7 +353,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) if self.text_encoder is not None: - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py index 337fa6aa8ae2..c95c7f1b9625 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py @@ -11,7 +11,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...image_processor import VaeImageProcessor -from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers @@ -281,7 +281,9 @@ def create_motion_field_and_warp_latents(motion_field_strength_x, motion_field_s return warped_latents -class TextToVideoZeroPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin): +class TextToVideoZeroPipeline( + DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin +): r""" Pipeline for zero-shot text-to-video generation using Stable Diffusion. 
@@ -831,7 +833,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -964,7 +966,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) if self.text_encoder is not None: - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py b/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py index 16bcf3808db4..4f65caf4e610 100644 --- a/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +++ b/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py @@ -14,7 +14,7 @@ ) from ...image_processor import VaeImageProcessor -from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers @@ -422,7 +422,7 @@ def encode_prompt( """ # set lora scale so that monkey patched LoRA # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): + if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin): self._lora_scale = lora_scale # dynamically adjust the LoRA scale @@ -555,7 +555,7 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) if self.text_encoder is not None: - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) diff --git a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py index 150e9168cc5f..2e3e8f198d5f 100644 --- a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +++ b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py @@ -20,7 +20,7 @@ import torch from transformers import CLIPTextModel, CLIPTokenizer -from ...loaders import LoraLoaderMixin +from ...loaders import StableDiffusionLoraLoaderMixin from ...schedulers import DDPMWuerstchenScheduler from ...utils import BaseOutput, deprecate, logging, replace_example_docstring from ...utils.torch_utils import randn_tensor @@ -62,7 +62,7 @@ class WuerstchenPriorPipelineOutput(BaseOutput): image_embeddings: Union[torch.Tensor, np.ndarray] -class WuerstchenPriorPipeline(DiffusionPipeline, LoraLoaderMixin): +class WuerstchenPriorPipeline(DiffusionPipeline, StableDiffusionLoraLoaderMixin): """ Pipeline for generating image prior for Wuerstchen. @@ -70,8 +70,8 @@ class WuerstchenPriorPipeline(DiffusionPipeline, LoraLoaderMixin): library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) 
The pipeline also inherits the following loading methods: - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights Args: prior ([`Prior`]): @@ -95,6 +95,7 @@ class WuerstchenPriorPipeline(DiffusionPipeline, LoraLoaderMixin): text_encoder_name = "text_encoder" model_cpu_offload_seq = "text_encoder->prior" _callback_tensor_inputs = ["latents", "text_encoder_hidden_states", "negative_prompt_embeds"] + _lora_loadable_modules = ["prior", "text_encoder"] def __init__( self, diff --git a/src/diffusers/schedulers/scheduling_deis_multistep.py b/src/diffusers/schedulers/scheduling_deis_multistep.py index 685765779e21..11073ce491d3 100644 --- a/src/diffusers/schedulers/scheduling_deis_multistep.py +++ b/src/diffusers/schedulers/scheduling_deis_multistep.py @@ -674,7 +674,7 @@ def _init_step_index(self, timestep): def step( self, model_output: torch.Tensor, - timestep: int, + timestep: Union[int, torch.Tensor], sample: torch.Tensor, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: @@ -685,7 +685,7 @@ def step( Args: model_output (`torch.Tensor`): The direct output from learned diffusion model. - timestep (`float`): + timestep (`int`): The current discrete timestep in the diffusion chain. sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py index 0f0e5296054f..4472a06c3428 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py @@ -920,7 +920,7 @@ def _init_step_index(self, timestep): def step( self, model_output: torch.Tensor, - timestep: int, + timestep: Union[int, torch.Tensor], sample: torch.Tensor, generator=None, variance_noise: Optional[torch.Tensor] = None, diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py index 4695941e725d..6628a92ba034 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py @@ -787,7 +787,7 @@ def _init_step_index(self, timestep): def step( self, model_output: torch.Tensor, - timestep: int, + timestep: Union[int, torch.Tensor], sample: torch.Tensor, generator=None, variance_noise: Optional[torch.Tensor] = None, diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py index 15008eec0e04..1a10fff043fb 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py @@ -22,6 +22,7 @@ from ..configuration_utils import ConfigMixin, register_to_config from ..utils import deprecate, logging +from ..utils.torch_utils import randn_tensor from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput @@ -108,11 +109,11 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin): The threshold value for dynamic thresholding. Valid only when `thresholding=True` and `algorithm_type="dpmsolver++"`. 
algorithm_type (`str`, defaults to `dpmsolver++`): - Algorithm type for the solver; can be `dpmsolver` or `dpmsolver++`. The `dpmsolver` type implements the - algorithms in the [DPMSolver](https://huggingface.co/papers/2206.00927) paper, and the `dpmsolver++` type - implements the algorithms in the [DPMSolver++](https://huggingface.co/papers/2211.01095) paper. It is - recommended to use `dpmsolver++` or `sde-dpmsolver++` with `solver_order=2` for guided sampling like in - Stable Diffusion. + Algorithm type for the solver; can be `dpmsolver` or `dpmsolver++` or `sde-dpmsolver++`. The `dpmsolver` + type implements the algorithms in the [DPMSolver](https://huggingface.co/papers/2206.00927) paper, and the + `dpmsolver++` type implements the algorithms in the [DPMSolver++](https://huggingface.co/papers/2211.01095) + paper. It is recommended to use `dpmsolver++` or `sde-dpmsolver++` with `solver_order=2` for guided + sampling like in Stable Diffusion. solver_type (`str`, defaults to `midpoint`): Solver type for the second-order solver; can be `midpoint` or `heun`. The solver type slightly affects the sample quality, especially for a small number of steps. It is recommended to use `midpoint` solvers. @@ -186,7 +187,7 @@ def __init__( self.init_noise_sigma = 1.0 # settings for DPM-Solver - if algorithm_type not in ["dpmsolver", "dpmsolver++"]: + if algorithm_type not in ["dpmsolver", "dpmsolver++", "sde-dpmsolver++"]: if algorithm_type == "deis": self.register_to_config(algorithm_type="dpmsolver++") else: @@ -197,7 +198,7 @@ def __init__( else: raise NotImplementedError(f"{solver_type} is not implemented for {self.__class__}") - if algorithm_type != "dpmsolver++" and final_sigmas_type == "zero": + if algorithm_type not in ["dpmsolver++", "sde-dpmsolver++"] and final_sigmas_type == "zero": raise ValueError( f"`final_sigmas_type` {final_sigmas_type} is not supported for `algorithm_type` {algorithm_type}. Please chooose `sigma_min` instead." ) @@ -493,10 +494,10 @@ def convert_model_output( "Passing `timesteps` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", ) # DPM-Solver++ needs to solve an integral of the data prediction model. - if self.config.algorithm_type == "dpmsolver++": + if self.config.algorithm_type in ["dpmsolver++", "sde-dpmsolver++"]: if self.config.prediction_type == "epsilon": # DPM-Solver and DPM-Solver++ only need the "mean" output. - if self.config.variance_type in ["learned_range"]: + if self.config.variance_type in ["learned", "learned_range"]: model_output = model_output[:, :3] sigma = self.sigmas[self.step_index] alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma) @@ -517,34 +518,43 @@ def convert_model_output( x0_pred = self._threshold_sample(x0_pred) return x0_pred + # DPM-Solver needs to solve an integral of the noise prediction model. elif self.config.algorithm_type == "dpmsolver": if self.config.prediction_type == "epsilon": # DPM-Solver and DPM-Solver++ only need the "mean" output. 
- if self.config.variance_type in ["learned_range"]: - model_output = model_output[:, :3] - return model_output + if self.config.variance_type in ["learned", "learned_range"]: + epsilon = model_output[:, :3] + else: + epsilon = model_output elif self.config.prediction_type == "sample": sigma = self.sigmas[self.step_index] alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma) epsilon = (sample - alpha_t * model_output) / sigma_t - return epsilon elif self.config.prediction_type == "v_prediction": sigma = self.sigmas[self.step_index] alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma) epsilon = alpha_t * model_output + sigma_t * sample - return epsilon else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" " `v_prediction` for the DPMSolverSinglestepScheduler." ) + if self.config.thresholding: + alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] + x0_pred = (sample - sigma_t * epsilon) / alpha_t + x0_pred = self._threshold_sample(x0_pred) + epsilon = (sample - alpha_t * x0_pred) / sigma_t + + return epsilon + def dpm_solver_first_order_update( self, model_output: torch.Tensor, *args, sample: torch.Tensor = None, + noise: Optional[torch.Tensor] = None, **kwargs, ) -> torch.Tensor: """ @@ -594,6 +604,13 @@ def dpm_solver_first_order_update( x_t = (sigma_t / sigma_s) * sample - (alpha_t * (torch.exp(-h) - 1.0)) * model_output elif self.config.algorithm_type == "dpmsolver": x_t = (alpha_t / alpha_s) * sample - (sigma_t * (torch.exp(h) - 1.0)) * model_output + elif self.config.algorithm_type == "sde-dpmsolver++": + assert noise is not None + x_t = ( + (sigma_t / sigma_s * torch.exp(-h)) * sample + + (alpha_t * (1 - torch.exp(-2.0 * h))) * model_output + + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise + ) return x_t def singlestep_dpm_solver_second_order_update( @@ -601,6 +618,7 @@ def singlestep_dpm_solver_second_order_update( model_output_list: List[torch.Tensor], *args, sample: torch.Tensor = None, + noise: Optional[torch.Tensor] = None, **kwargs, ) -> torch.Tensor: """ @@ -688,6 +706,22 @@ def singlestep_dpm_solver_second_order_update( - (sigma_t * (torch.exp(h) - 1.0)) * D0 - (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1 ) + elif self.config.algorithm_type == "sde-dpmsolver++": + assert noise is not None + if self.config.solver_type == "midpoint": + x_t = ( + (sigma_t / sigma_s1 * torch.exp(-h)) * sample + + (alpha_t * (1 - torch.exp(-2.0 * h))) * D0 + + 0.5 * (alpha_t * (1 - torch.exp(-2.0 * h))) * D1 + + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise + ) + elif self.config.solver_type == "heun": + x_t = ( + (sigma_t / sigma_s1 * torch.exp(-h)) * sample + + (alpha_t * (1 - torch.exp(-2.0 * h))) * D0 + + (alpha_t * ((1.0 - torch.exp(-2.0 * h)) / (-2.0 * h) + 1.0)) * D1 + + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise + ) return x_t def singlestep_dpm_solver_third_order_update( @@ -800,6 +834,7 @@ def singlestep_dpm_solver_update( *args, sample: torch.Tensor = None, order: int = None, + noise: Optional[torch.Tensor] = None, **kwargs, ) -> torch.Tensor: """ @@ -848,9 +883,9 @@ def singlestep_dpm_solver_update( ) if order == 1: - return self.dpm_solver_first_order_update(model_output_list[-1], sample=sample) + return self.dpm_solver_first_order_update(model_output_list[-1], sample=sample, noise=noise) elif order == 2: - return self.singlestep_dpm_solver_second_order_update(model_output_list, sample=sample) + return self.singlestep_dpm_solver_second_order_update(model_output_list, 
sample=sample, noise=noise) elif order == 3: return self.singlestep_dpm_solver_third_order_update(model_output_list, sample=sample) else: @@ -892,8 +927,9 @@ def _init_step_index(self, timestep): def step( self, model_output: torch.Tensor, - timestep: int, + timestep: Union[int, torch.Tensor], sample: torch.Tensor, + generator=None, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: """ @@ -929,6 +965,13 @@ def step( self.model_outputs[i] = self.model_outputs[i + 1] self.model_outputs[-1] = model_output + if self.config.algorithm_type == "sde-dpmsolver++": + noise = randn_tensor( + model_output.shape, generator=generator, device=model_output.device, dtype=model_output.dtype + ) + else: + noise = None + order = self.order_list[self.step_index] # For img2img denoising might start with order>1 which is not possible @@ -940,9 +983,11 @@ def step( if order == 1: self.sample = sample - prev_sample = self.singlestep_dpm_solver_update(self.model_outputs, sample=self.sample, order=order) + prev_sample = self.singlestep_dpm_solver_update( + self.model_outputs, sample=self.sample, order=order, noise=noise + ) - # upon completion increase step index by one + # upon completion increase step index by one, noise=noise self._step_index += 1 if not return_dict: diff --git a/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py index 6eef247bfdd4..21ae1df00a88 100644 --- a/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py @@ -594,7 +594,7 @@ def _init_step_index(self, timestep): def step( self, model_output: torch.Tensor, - timestep: int, + timestep: Union[int, torch.Tensor], sample: torch.Tensor, generator=None, return_dict: bool = True, diff --git a/src/diffusers/schedulers/scheduling_ipndm.py b/src/diffusers/schedulers/scheduling_ipndm.py index 9f15f1fe0aab..28f349ae2114 100644 --- a/src/diffusers/schedulers/scheduling_ipndm.py +++ b/src/diffusers/schedulers/scheduling_ipndm.py @@ -138,7 +138,7 @@ def _init_step_index(self, timestep): def step( self, model_output: torch.Tensor, - timestep: int, + timestep: Union[int, torch.Tensor], sample: torch.Tensor, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: diff --git a/src/diffusers/schedulers/scheduling_unipc_multistep.py b/src/diffusers/schedulers/scheduling_unipc_multistep.py index 367761ce3fef..995f85c020ed 100644 --- a/src/diffusers/schedulers/scheduling_unipc_multistep.py +++ b/src/diffusers/schedulers/scheduling_unipc_multistep.py @@ -822,7 +822,7 @@ def _init_step_index(self, timestep): def step( self, model_output: torch.Tensor, - timestep: int, + timestep: Union[int, torch.Tensor], sample: torch.Tensor, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: diff --git a/src/diffusers/schedulers/scheduling_utils.py b/src/diffusers/schedulers/scheduling_utils.py index 33d34e26d89d..f20224b19009 100644 --- a/src/diffusers/schedulers/scheduling_utils.py +++ b/src/diffusers/schedulers/scheduling_utils.py @@ -121,9 +121,7 @@ def from_pretrained( force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. - resume_download: - Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 - of Diffusers. 
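With the scheduler changes above, `DPMSolverSinglestepScheduler` accepts `algorithm_type="sde-dpmsolver++"` and draws its stochastic noise through the new `generator` argument of `step()`. A hedged usage sketch (the checkpoint path is a placeholder, not taken from this patch):

import torch
from diffusers import DiffusionPipeline, DPMSolverSinglestepScheduler

pipe = DiffusionPipeline.from_pretrained("path/to/stable-diffusion-checkpoint")  # placeholder model path
pipe.scheduler = DPMSolverSinglestepScheduler.from_config(
    pipe.scheduler.config, algorithm_type="sde-dpmsolver++", solver_order=2
)
# The SDE variant is stochastic, so pass a generator for reproducible results.
image = pipe("a prompt", num_inference_steps=25, generator=torch.manual_seed(0)).images[0]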
+ proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. diff --git a/src/diffusers/schedulers/scheduling_utils_flax.py b/src/diffusers/schedulers/scheduling_utils_flax.py index 360ca4705e02..ae11baf9ea1b 100644 --- a/src/diffusers/schedulers/scheduling_utils_flax.py +++ b/src/diffusers/schedulers/scheduling_utils_flax.py @@ -102,9 +102,7 @@ def from_pretrained( force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. - resume_download: - Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 - of Diffusers. + proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py index 1612ca5ae4c0..f41edfcda3d8 100644 --- a/src/diffusers/utils/__init__.py +++ b/src/diffusers/utils/__init__.py @@ -73,7 +73,6 @@ is_librosa_available, is_matplotlib_available, is_note_seq_available, - is_notebook, is_onnx_available, is_peft_available, is_peft_version, diff --git a/src/diffusers/utils/dummy_pt_objects.py b/src/diffusers/utils/dummy_pt_objects.py index 5df0d6d28f53..3ead6fd99d10 100644 --- a/src/diffusers/utils/dummy_pt_objects.py +++ b/src/diffusers/utils/dummy_pt_objects.py @@ -362,6 +362,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch"]) +class SparseControlNetModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + class T2IAdapter(metaclass=DummyObject): _backends = ["torch"] diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index 399656d8c185..8e40e5128854 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -107,6 +107,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) +class AnimateDiffSparseControlNetPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class AnimateDiffVideoToVideoPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] diff --git a/src/diffusers/utils/dynamic_modules_utils.py b/src/diffusers/utils/dynamic_modules_utils.py index 733579b8c09c..f0cf953924ad 100644 --- a/src/diffusers/utils/dynamic_modules_utils.py +++ b/src/diffusers/utils/dynamic_modules_utils.py @@ -199,7 +199,6 @@ def get_cached_module_file( module_file: str, cache_dir: Optional[Union[str, os.PathLike]] = None, force_download: bool = False, - resume_download: Optional[bool] = None, 
proxies: Optional[Dict[str, str]] = None, token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, @@ -226,9 +225,7 @@ def get_cached_module_file( cache should not be used. force_download (`bool`, *optional*, defaults to `False`): Whether or not to force to (re-)download the configuration files and override the cached versions if they - exist. resume_download: - Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 - of Diffusers. + exist. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. @@ -309,7 +306,6 @@ def get_cached_module_file( cache_dir=cache_dir, force_download=force_download, proxies=proxies, - resume_download=resume_download, local_files_only=local_files_only, token=token, ) @@ -366,7 +362,6 @@ def get_cached_module_file( f"{module_needed}.py", cache_dir=cache_dir, force_download=force_download, - resume_download=resume_download, proxies=proxies, token=token, revision=revision, @@ -382,7 +377,6 @@ def get_class_from_dynamic_module( class_name: Optional[str] = None, cache_dir: Optional[Union[str, os.PathLike]] = None, force_download: bool = False, - resume_download: Optional[bool] = None, proxies: Optional[Dict[str, str]] = None, token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, @@ -419,9 +413,6 @@ def get_class_from_dynamic_module( force_download (`bool`, *optional*, defaults to `False`): Whether or not to force to (re-)download the configuration files and override the cached versions if they exist. - resume_download: - Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 of - Diffusers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. 
@@ -458,7 +449,6 @@ def get_class_from_dynamic_module( module_file, cache_dir=cache_dir, force_download=force_download, - resume_download=resume_download, proxies=proxies, token=token, revision=revision, diff --git a/src/diffusers/utils/hub_utils.py b/src/diffusers/utils/hub_utils.py index 7ecb7de89cd3..020411dc7883 100644 --- a/src/diffusers/utils/hub_utils.py +++ b/src/diffusers/utils/hub_utils.py @@ -271,7 +271,8 @@ def move_cache(old_cache_dir: Optional[str] = None, new_cache_dir: Optional[str] def _add_variant(weights_name: str, variant: Optional[str] = None) -> str: if variant is not None: splits = weights_name.split(".") - splits = splits[:-1] + [variant] + splits[-1:] + split_index = -2 if weights_name.endswith(".index.json") else -1 + splits = splits[:-split_index] + [variant] + splits[-split_index:] weights_name = ".".join(splits) return weights_name @@ -286,7 +287,6 @@ def _get_model_file( cache_dir: Optional[str] = None, force_download: bool = False, proxies: Optional[Dict] = None, - resume_download: Optional[bool] = None, local_files_only: bool = False, token: Optional[str] = None, user_agent: Optional[Union[Dict, str]] = None, @@ -324,7 +324,6 @@ def _get_model_file( cache_dir=cache_dir, force_download=force_download, proxies=proxies, - resume_download=resume_download, local_files_only=local_files_only, token=token, user_agent=user_agent, @@ -349,7 +348,6 @@ def _get_model_file( cache_dir=cache_dir, force_download=force_download, proxies=proxies, - resume_download=resume_download, local_files_only=local_files_only, token=token, user_agent=user_agent, @@ -417,7 +415,6 @@ def _get_checkpoint_shard_files( index_filename, cache_dir=None, proxies=None, - resume_download=False, local_files_only=False, token=None, user_agent=None, @@ -475,7 +472,6 @@ def _get_checkpoint_shard_files( cached_folder = snapshot_download( pretrained_model_name_or_path, cache_dir=cache_dir, - resume_download=resume_download, proxies=proxies, local_files_only=local_files_only, token=token, diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py index 475d6e415eea..44477df2e220 100644 --- a/src/diffusers/utils/import_utils.py +++ b/src/diffusers/utils/import_utils.py @@ -321,18 +321,7 @@ def is_timm_available(): except importlib_metadata.PackageNotFoundError: _bitsandbytes_available = False -# Taken from `huggingface_hub`. -_is_notebook = False -try: - shell_class = get_ipython().__class__ # type: ignore # noqa: F821 - for parent_class in shell_class.__mro__: # e.g. "is subclass of" - if parent_class.__name__ == "ZMQInteractiveShell": - _is_notebook = True # Jupyter notebook, Google colab or qtconsole - break -except NameError: - pass # Probably standard Python interpreter - -_is_google_colab = "google.colab" in sys.modules +_is_google_colab = "google.colab" in sys.modules or any(k.startswith("COLAB_") for k in os.environ) def is_torch_available(): @@ -443,10 +432,6 @@ def is_bitsandbytes_available(): return _bitsandbytes_available -def is_notebook(): - return _is_notebook - - def is_google_colab(): return _is_google_colab diff --git a/tests/lora/test_lora_layers_sd3.py b/tests/lora/test_lora_layers_sd3.py index 48d0b9d8a533..9ce559be7f06 100644 --- a/tests/lora/test_lora_layers_sd3.py +++ b/tests/lora/test_lora_layers_sd3.py @@ -12,376 +12,55 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
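The `_add_variant` fix in hub_utils.py above makes variant insertion aware of sharded checkpoint indexes, so for sharded checkpoints the variant is placed before the `.index.json` suffix rather than inside it. A standalone copy of the rule with the resulting file names (for illustration; not imported from diffusers):

from typing import Optional

def add_variant(weights_name: str, variant: Optional[str] = None) -> str:
    # Same logic as the patched `_add_variant`, reproduced here to show the output names.
    if variant is not None:
        splits = weights_name.split(".")
        split_index = -2 if weights_name.endswith(".index.json") else -1
        splits = splits[:-split_index] + [variant] + splits[-split_index:]
        weights_name = ".".join(splits)
    return weights_name

print(add_variant("diffusion_pytorch_model.safetensors", "fp16"))
# diffusion_pytorch_model.fp16.safetensors
print(add_variant("diffusion_pytorch_model.safetensors.index.json", "fp16"))
# diffusion_pytorch_model.safetensors.fp16.index.json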
-import os import sys -import tempfile import unittest -import numpy as np -import torch -from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer, T5EncoderModel - from diffusers import ( - AutoencoderKL, FlowMatchEulerDiscreteScheduler, - SD3Transformer2DModel, StableDiffusion3Pipeline, ) from diffusers.utils.testing_utils import is_peft_available, require_peft_backend, require_torch_gpu, torch_device if is_peft_available(): - from peft import LoraConfig - from peft.utils import get_peft_model_state_dict + pass sys.path.append(".") -from utils import check_if_lora_correctly_set # noqa: E402 +from utils import PeftLoraLoaderMixinTests # noqa: E402 @require_peft_backend -class SD3LoRATests(unittest.TestCase): +class SD3LoRATests(unittest.TestCase, PeftLoraLoaderMixinTests): pipeline_class = StableDiffusion3Pipeline - - def get_dummy_components(self): - torch.manual_seed(0) - transformer = SD3Transformer2DModel( - sample_size=32, - patch_size=1, - in_channels=4, - num_layers=1, - attention_head_dim=8, - num_attention_heads=4, - caption_projection_dim=32, - joint_attention_dim=32, - pooled_projection_dim=64, - out_channels=4, - ) - clip_text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - hidden_act="gelu", - projection_dim=32, - ) - - torch.manual_seed(0) - text_encoder = CLIPTextModelWithProjection(clip_text_encoder_config) - - torch.manual_seed(0) - text_encoder_2 = CLIPTextModelWithProjection(clip_text_encoder_config) - - text_encoder_3 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5") - - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - tokenizer_3 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5") - - torch.manual_seed(0) - vae = AutoencoderKL( - sample_size=32, - in_channels=3, - out_channels=3, - block_out_channels=(4,), - layers_per_block=1, - latent_channels=4, - norm_num_groups=1, - use_quant_conv=False, - use_post_quant_conv=False, - shift_factor=0.0609, - scaling_factor=1.5035, - ) - - scheduler = FlowMatchEulerDiscreteScheduler() - - return { - "scheduler": scheduler, - "text_encoder": text_encoder, - "text_encoder_2": text_encoder_2, - "text_encoder_3": text_encoder_3, - "tokenizer": tokenizer, - "tokenizer_2": tokenizer_2, - "tokenizer_3": tokenizer_3, - "transformer": transformer, - "vae": vae, - } - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device="cpu").manual_seed(seed) - - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 5.0, - "output_type": "np", - } - return inputs - - def get_lora_config_for_transformer(self): - lora_config = LoraConfig( - r=4, - lora_alpha=4, - target_modules=["to_q", "to_k", "to_v", "to_out.0"], - init_lora_weights=False, - use_dora=False, - ) - return lora_config - - def get_lora_config_for_text_encoders(self): - text_lora_config = LoraConfig( - r=4, - lora_alpha=4, - init_lora_weights="gaussian", - target_modules=["q_proj", "k_proj", "v_proj", "out_proj"], - ) - return text_lora_config - - def test_simple_inference_with_transformer_lora_save_load(self): - components = 
self.get_dummy_components() - transformer_config = self.get_lora_config_for_transformer() - - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - pipe.transformer.add_adapter(transformer_config) - self.assertTrue(check_if_lora_correctly_set(pipe.transformer), "Lora not correctly set in transformer") - inputs = self.get_dummy_inputs(torch_device) - images_lora = pipe(**inputs).images - - with tempfile.TemporaryDirectory() as tmpdirname: - transformer_state_dict = get_peft_model_state_dict(pipe.transformer) - - self.pipeline_class.save_lora_weights( - save_directory=tmpdirname, - transformer_lora_layers=transformer_state_dict, - ) - - self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors"))) - pipe.unload_lora_weights() - - pipe.load_lora_weights(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors")) - - inputs = self.get_dummy_inputs(torch_device) - images_lora_from_pretrained = pipe(**inputs).images - self.assertTrue(check_if_lora_correctly_set(pipe.transformer), "Lora not correctly set in transformer") - - self.assertTrue( - np.allclose(images_lora, images_lora_from_pretrained, atol=1e-3, rtol=1e-3), - "Loading from saved checkpoints should give same results.", - ) - - def test_simple_inference_with_clip_encoders_lora_save_load(self): - components = self.get_dummy_components() - transformer_config = self.get_lora_config_for_transformer() - text_encoder_config = self.get_lora_config_for_text_encoders() - - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs(torch_device) - - pipe.transformer.add_adapter(transformer_config) - pipe.text_encoder.add_adapter(text_encoder_config) - pipe.text_encoder_2.add_adapter(text_encoder_config) - - self.assertTrue(check_if_lora_correctly_set(pipe.transformer), "Lora not correctly set in transformer") - self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder.") - self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2.") - - inputs = self.get_dummy_inputs(torch_device) - images_lora = pipe(**inputs).images - - with tempfile.TemporaryDirectory() as tmpdirname: - transformer_state_dict = get_peft_model_state_dict(pipe.transformer) - text_encoder_one_state_dict = get_peft_model_state_dict(pipe.text_encoder) - text_encoder_two_state_dict = get_peft_model_state_dict(pipe.text_encoder_2) - - self.pipeline_class.save_lora_weights( - save_directory=tmpdirname, - transformer_lora_layers=transformer_state_dict, - text_encoder_lora_layers=text_encoder_one_state_dict, - text_encoder_2_lora_layers=text_encoder_two_state_dict, - ) - - self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors"))) - pipe.unload_lora_weights() - - pipe.load_lora_weights(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors")) - - inputs = self.get_dummy_inputs(torch_device) - images_lora_from_pretrained = pipe(**inputs).images - self.assertTrue(check_if_lora_correctly_set(pipe.transformer), "Lora not correctly set in transformer") - self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text_encoder_one") - self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text_encoder_two") - - self.assertTrue( - np.allclose(images_lora, images_lora_from_pretrained, atol=1e-3, 
rtol=1e-3), - "Loading from saved checkpoints should give same results.", - ) - - def test_simple_inference_with_transformer_lora_and_scale(self): - components = self.get_dummy_components() - transformer_lora_config = self.get_lora_config_for_transformer() - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - output_no_lora = pipe(**inputs).images - - pipe.transformer.add_adapter(transformer_lora_config) - self.assertTrue(check_if_lora_correctly_set(pipe.transformer), "Lora not correctly set in transformer") - - inputs = self.get_dummy_inputs(torch_device) - output_lora = pipe(**inputs).images - self.assertTrue( - not np.allclose(output_lora, output_no_lora, atol=1e-3, rtol=1e-3), "Lora should change the output" - ) - - inputs = self.get_dummy_inputs(torch_device) - output_lora_scale = pipe(**inputs, joint_attention_kwargs={"scale": 0.5}).images - self.assertTrue( - not np.allclose(output_lora, output_lora_scale, atol=1e-3, rtol=1e-3), - "Lora + scale should change the output", - ) - - inputs = self.get_dummy_inputs(torch_device) - output_lora_0_scale = pipe(**inputs, joint_attention_kwargs={"scale": 0.0}).images - self.assertTrue( - np.allclose(output_no_lora, output_lora_0_scale, atol=1e-3, rtol=1e-3), - "Lora + 0 scale should lead to same result as no LoRA", - ) - - def test_simple_inference_with_clip_encoders_lora_and_scale(self): - components = self.get_dummy_components() - transformer_lora_config = self.get_lora_config_for_transformer() - text_encoder_config = self.get_lora_config_for_text_encoders() - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - output_no_lora = pipe(**inputs).images - - pipe.transformer.add_adapter(transformer_lora_config) - pipe.text_encoder.add_adapter(text_encoder_config) - pipe.text_encoder_2.add_adapter(text_encoder_config) - self.assertTrue(check_if_lora_correctly_set(pipe.transformer), "Lora not correctly set in transformer") - self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text_encoder_one") - self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text_encoder_two") - - inputs = self.get_dummy_inputs(torch_device) - output_lora = pipe(**inputs).images - self.assertTrue( - not np.allclose(output_lora, output_no_lora, atol=1e-3, rtol=1e-3), "Lora should change the output" - ) - - inputs = self.get_dummy_inputs(torch_device) - output_lora_scale = pipe(**inputs, joint_attention_kwargs={"scale": 0.5}).images - self.assertTrue( - not np.allclose(output_lora, output_lora_scale, atol=1e-3, rtol=1e-3), - "Lora + scale should change the output", - ) - - inputs = self.get_dummy_inputs(torch_device) - output_lora_0_scale = pipe(**inputs, joint_attention_kwargs={"scale": 0.0}).images - self.assertTrue( - np.allclose(output_no_lora, output_lora_0_scale, atol=1e-3, rtol=1e-3), - "Lora + 0 scale should lead to same result as no LoRA", - ) - - def test_simple_inference_with_transformer_fused(self): - components = self.get_dummy_components() - transformer_lora_config = self.get_lora_config_for_transformer() - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - output_no_lora = pipe(**inputs).images - - 
pipe.transformer.add_adapter(transformer_lora_config) - self.assertTrue(check_if_lora_correctly_set(pipe.transformer), "Lora not correctly set in transformer") - - pipe.fuse_lora() - # Fusing should still keep the LoRA layers - self.assertTrue(check_if_lora_correctly_set(pipe.transformer), "Lora not correctly set in transformer") - - inputs = self.get_dummy_inputs(torch_device) - ouput_fused = pipe(**inputs).images - self.assertFalse( - np.allclose(ouput_fused, output_no_lora, atol=1e-3, rtol=1e-3), "Fused lora should change the output" - ) - - def test_simple_inference_with_transformer_fused_with_no_fusion(self): - components = self.get_dummy_components() - transformer_lora_config = self.get_lora_config_for_transformer() - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - output_no_lora = pipe(**inputs).images - - pipe.transformer.add_adapter(transformer_lora_config) - self.assertTrue(check_if_lora_correctly_set(pipe.transformer), "Lora not correctly set in transformer") - inputs = self.get_dummy_inputs(torch_device) - ouput_lora = pipe(**inputs).images - - pipe.fuse_lora() - # Fusing should still keep the LoRA layers - self.assertTrue(check_if_lora_correctly_set(pipe.transformer), "Lora not correctly set in transformer") - - inputs = self.get_dummy_inputs(torch_device) - ouput_fused = pipe(**inputs).images - self.assertFalse( - np.allclose(ouput_fused, output_no_lora, atol=1e-3, rtol=1e-3), "Fused lora should change the output" - ) - self.assertTrue( - np.allclose(ouput_fused, ouput_lora, atol=1e-3, rtol=1e-3), - "Fused lora output should be changed when LoRA isn't fused but still effective.", - ) - - def test_simple_inference_with_transformer_fuse_unfuse(self): - components = self.get_dummy_components() - transformer_lora_config = self.get_lora_config_for_transformer() - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - output_no_lora = pipe(**inputs).images - - pipe.transformer.add_adapter(transformer_lora_config) - self.assertTrue(check_if_lora_correctly_set(pipe.transformer), "Lora not correctly set in transformer") - - pipe.fuse_lora() - # Fusing should still keep the LoRA layers - self.assertTrue(check_if_lora_correctly_set(pipe.transformer), "Lora not correctly set in transformer") - inputs = self.get_dummy_inputs(torch_device) - ouput_fused = pipe(**inputs).images - self.assertFalse( - np.allclose(ouput_fused, output_no_lora, atol=1e-3, rtol=1e-3), "Fused lora should change the output" - ) - - pipe.unfuse_lora() - self.assertTrue(check_if_lora_correctly_set(pipe.transformer), "Lora not correctly set in transformer") - inputs = self.get_dummy_inputs(torch_device) - output_unfused_lora = pipe(**inputs).images - self.assertTrue( - np.allclose(ouput_fused, output_unfused_lora, atol=1e-3, rtol=1e-3), "Fused lora should change the output" - ) + scheduler_cls = FlowMatchEulerDiscreteScheduler() + scheduler_kwargs = {} + transformer_kwargs = { + "sample_size": 32, + "patch_size": 1, + "in_channels": 4, + "num_layers": 1, + "attention_head_dim": 8, + "num_attention_heads": 4, + "caption_projection_dim": 32, + "joint_attention_dim": 32, + "pooled_projection_dim": 64, + "out_channels": 4, + } + vae_kwargs = { + "sample_size": 32, + "in_channels": 3, + "out_channels": 3, + "block_out_channels": (4,), + "layers_per_block": 1, + "latent_channels": 4, + 
"norm_num_groups": 1, + "use_quant_conv": False, + "use_post_quant_conv": False, + "shift_factor": 0.0609, + "scaling_factor": 1.5035, + } + has_three_text_encoders = True @require_torch_gpu def test_sd3_lora(self): diff --git a/tests/lora/utils.py b/tests/lora/utils.py index 9a07727db931..ca2e92832229 100644 --- a/tests/lora/utils.py +++ b/tests/lora/utils.py @@ -19,12 +19,14 @@ import numpy as np import torch -from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer +from transformers import AutoTokenizer, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer, T5EncoderModel from diffusers import ( AutoencoderKL, DDIMScheduler, + FlowMatchEulerDiscreteScheduler, LCMScheduler, + SD3Transformer2DModel, UNet2DConditionModel, ) from diffusers.utils.import_utils import is_peft_available @@ -71,28 +73,47 @@ class PeftLoraLoaderMixinTests: scheduler_cls = None scheduler_kwargs = None has_two_text_encoders = False + has_three_text_encoders = False unet_kwargs = None + transformer_kwargs = None vae_kwargs = None def get_dummy_components(self, scheduler_cls=None, use_dora=False): + if self.unet_kwargs and self.transformer_kwargs: + raise ValueError("Both `unet_kwargs` and `transformer_kwargs` cannot be specified.") + if self.has_two_text_encoders and self.has_three_text_encoders: + raise ValueError("Both `has_two_text_encoders` and `has_three_text_encoders` cannot be True.") + scheduler_cls = self.scheduler_cls if scheduler_cls is None else scheduler_cls rank = 4 torch.manual_seed(0) - unet = UNet2DConditionModel(**self.unet_kwargs) + if self.unet_kwargs is not None: + unet = UNet2DConditionModel(**self.unet_kwargs) + else: + transformer = SD3Transformer2DModel(**self.transformer_kwargs) scheduler = scheduler_cls(**self.scheduler_kwargs) torch.manual_seed(0) vae = AutoencoderKL(**self.vae_kwargs) - text_encoder = CLIPTextModel.from_pretrained("peft-internal-testing/tiny-clip-text-2") - tokenizer = CLIPTokenizer.from_pretrained("peft-internal-testing/tiny-clip-text-2") + if not self.has_three_text_encoders: + text_encoder = CLIPTextModel.from_pretrained("peft-internal-testing/tiny-clip-text-2") + tokenizer = CLIPTokenizer.from_pretrained("peft-internal-testing/tiny-clip-text-2") if self.has_two_text_encoders: text_encoder_2 = CLIPTextModelWithProjection.from_pretrained("peft-internal-testing/tiny-clip-text-2") tokenizer_2 = CLIPTokenizer.from_pretrained("peft-internal-testing/tiny-clip-text-2") + if self.has_three_text_encoders: + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + tokenizer_3 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5") + text_encoder = CLIPTextModelWithProjection.from_pretrained("hf-internal-testing/tiny-sd3-text_encoder") + text_encoder_2 = CLIPTextModelWithProjection.from_pretrained("hf-internal-testing/tiny-sd3-text_encoder-2") + text_encoder_3 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5") + text_lora_config = LoraConfig( r=rank, lora_alpha=rank, @@ -101,7 +122,7 @@ def get_dummy_components(self, scheduler_cls=None, use_dora=False): use_dora=use_dora, ) - unet_lora_config = LoraConfig( + denoiser_lora_config = LoraConfig( r=rank, lora_alpha=rank, target_modules=["to_q", "to_k", "to_v", "to_out.0"], @@ -109,18 +130,31 @@ def get_dummy_components(self, scheduler_cls=None, use_dora=False): use_dora=use_dora, ) - if self.has_two_text_encoders: - pipeline_components = { - "unet": unet, - 
"scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "text_encoder_2": text_encoder_2, - "tokenizer_2": tokenizer_2, - "image_encoder": None, - "feature_extractor": None, - } + if self.has_two_text_encoders or self.has_three_text_encoders: + if self.unet_kwargs is not None: + pipeline_components = { + "unet": unet, + "scheduler": scheduler, + "vae": vae, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "text_encoder_2": text_encoder_2, + "tokenizer_2": tokenizer_2, + "image_encoder": None, + "feature_extractor": None, + } + elif self.has_three_text_encoders and self.transformer_kwargs is not None: + pipeline_components = { + "transformer": transformer, + "scheduler": scheduler, + "vae": vae, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "text_encoder_2": text_encoder_2, + "tokenizer_2": tokenizer_2, + "text_encoder_3": text_encoder_3, + "tokenizer_3": tokenizer_3, + } else: pipeline_components = { "unet": unet, @@ -133,7 +167,7 @@ def get_dummy_components(self, scheduler_cls=None, use_dora=False): "image_encoder": None, } - return pipeline_components, text_lora_config, unet_lora_config + return pipeline_components, text_lora_config, denoiser_lora_config def get_dummy_inputs(self, with_generator=True): batch_size = 1 @@ -170,7 +204,12 @@ def test_simple_inference(self): """ Tests a simple inference and makes sure it works as expected """ - for scheduler_cls in [DDIMScheduler, LCMScheduler]: + scheduler_classes = ( + [FlowMatchEulerDiscreteScheduler] + if self.has_three_text_encoders and self.transformer_kwargs + else [DDIMScheduler, LCMScheduler] + ) + for scheduler_cls in scheduler_classes: components, text_lora_config, _ = self.get_dummy_components(scheduler_cls) pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device) @@ -178,14 +217,20 @@ def test_simple_inference(self): _, _, inputs = self.get_dummy_inputs() output_no_lora = pipe(**inputs).images - self.assertTrue(output_no_lora.shape == (1, 64, 64, 3)) + shape_to_be_checked = (1, 64, 64, 3) if self.unet_kwargs is not None else (1, 32, 32, 3) + self.assertTrue(output_no_lora.shape == shape_to_be_checked) def test_simple_inference_with_text_lora(self): """ Tests a simple inference with lora attached on the text encoder and makes sure it works as expected """ - for scheduler_cls in [DDIMScheduler, LCMScheduler]: + scheduler_classes = ( + [FlowMatchEulerDiscreteScheduler] + if self.has_three_text_encoders and self.transformer_kwargs + else [DDIMScheduler, LCMScheduler] + ) + for scheduler_cls in scheduler_classes: components, text_lora_config, _ = self.get_dummy_components(scheduler_cls) pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device) @@ -193,12 +238,13 @@ def test_simple_inference_with_text_lora(self): _, _, inputs = self.get_dummy_inputs(with_generator=False) output_no_lora = pipe(**inputs, generator=torch.manual_seed(0)).images - self.assertTrue(output_no_lora.shape == (1, 64, 64, 3)) + shape_to_be_checked = (1, 64, 64, 3) if self.unet_kwargs is not None else (1, 32, 32, 3) + self.assertTrue(output_no_lora.shape == shape_to_be_checked) pipe.text_encoder.add_adapter(text_lora_config) self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder") - if self.has_two_text_encoders: + if self.has_two_text_encoders or self.has_three_text_encoders: pipe.text_encoder_2.add_adapter(text_lora_config) self.assertTrue( check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text 
encoder 2" @@ -214,7 +260,12 @@ def test_simple_inference_with_text_lora_and_scale(self): Tests a simple inference with lora attached on the text encoder + scale argument and makes sure it works as expected """ - for scheduler_cls in [DDIMScheduler, LCMScheduler]: + scheduler_classes = ( + [FlowMatchEulerDiscreteScheduler] + if self.has_three_text_encoders and self.transformer_kwargs + else [DDIMScheduler, LCMScheduler] + ) + for scheduler_cls in scheduler_classes: components, text_lora_config, _ = self.get_dummy_components(scheduler_cls) pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device) @@ -222,12 +273,13 @@ def test_simple_inference_with_text_lora_and_scale(self): _, _, inputs = self.get_dummy_inputs(with_generator=False) output_no_lora = pipe(**inputs, generator=torch.manual_seed(0)).images - self.assertTrue(output_no_lora.shape == (1, 64, 64, 3)) + shape_to_be_checked = (1, 64, 64, 3) if self.unet_kwargs is not None else (1, 32, 32, 3) + self.assertTrue(output_no_lora.shape == shape_to_be_checked) pipe.text_encoder.add_adapter(text_lora_config) self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder") - if self.has_two_text_encoders: + if self.has_two_text_encoders or self.has_three_text_encoders: pipe.text_encoder_2.add_adapter(text_lora_config) self.assertTrue( check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2" @@ -238,17 +290,27 @@ def test_simple_inference_with_text_lora_and_scale(self): not np.allclose(output_lora, output_no_lora, atol=1e-3, rtol=1e-3), "Lora should change the output" ) - output_lora_scale = pipe( - **inputs, generator=torch.manual_seed(0), cross_attention_kwargs={"scale": 0.5} - ).images + if self.unet_kwargs is not None: + output_lora_scale = pipe( + **inputs, generator=torch.manual_seed(0), cross_attention_kwargs={"scale": 0.5} + ).images + else: + output_lora_scale = pipe( + **inputs, generator=torch.manual_seed(0), joint_attention_kwargs={"scale": 0.5} + ).images self.assertTrue( not np.allclose(output_lora, output_lora_scale, atol=1e-3, rtol=1e-3), "Lora + scale should change the output", ) - output_lora_0_scale = pipe( - **inputs, generator=torch.manual_seed(0), cross_attention_kwargs={"scale": 0.0} - ).images + if self.unet_kwargs is not None: + output_lora_0_scale = pipe( + **inputs, generator=torch.manual_seed(0), cross_attention_kwargs={"scale": 0.0} + ).images + else: + output_lora_0_scale = pipe( + **inputs, generator=torch.manual_seed(0), joint_attention_kwargs={"scale": 0.0} + ).images self.assertTrue( np.allclose(output_no_lora, output_lora_0_scale, atol=1e-3, rtol=1e-3), "Lora + 0 scale should lead to same result as no LoRA", @@ -259,7 +321,12 @@ def test_simple_inference_with_text_lora_fused(self): Tests a simple inference with lora attached into text encoder + fuses the lora weights into base model and makes sure it works as expected """ - for scheduler_cls in [DDIMScheduler, LCMScheduler]: + scheduler_classes = ( + [FlowMatchEulerDiscreteScheduler] + if self.has_three_text_encoders and self.transformer_kwargs + else [DDIMScheduler, LCMScheduler] + ) + for scheduler_cls in scheduler_classes: components, text_lora_config, _ = self.get_dummy_components(scheduler_cls) pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device) @@ -267,12 +334,13 @@ def test_simple_inference_with_text_lora_fused(self): _, _, inputs = self.get_dummy_inputs(with_generator=False) output_no_lora = pipe(**inputs, generator=torch.manual_seed(0)).images - 
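
As a side note on the scale branches above: the same LoRA scale is passed under a different keyword depending on the denoiser family. A minimal sketch of the pattern in user code, not part of the patch itself — the pipeline object, adapter setup, and prompt are placeholders:

def run_with_lora_scale(pipe, prompt="a photo of a cat"):
    # UNet pipelines (SD/SDXL) read the LoRA scale from `cross_attention_kwargs`,
    # while the SD3 transformer pipeline reads it from `joint_attention_kwargs`.
    if hasattr(pipe, "unet"):
        return pipe(prompt, cross_attention_kwargs={"scale": 0.5}).images
    return pipe(prompt, joint_attention_kwargs={"scale": 0.5}).images

A scale of 0.0 is expected to reproduce the no-LoRA output, which is exactly what the assertions in these tests check.
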
self.assertTrue(output_no_lora.shape == (1, 64, 64, 3)) + shape_to_be_checked = (1, 64, 64, 3) if self.unet_kwargs is not None else (1, 32, 32, 3) + self.assertTrue(output_no_lora.shape == shape_to_be_checked) pipe.text_encoder.add_adapter(text_lora_config) self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder") - if self.has_two_text_encoders: + if self.has_two_text_encoders or self.has_three_text_encoders: pipe.text_encoder_2.add_adapter(text_lora_config) self.assertTrue( check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2" @@ -282,7 +350,7 @@ def test_simple_inference_with_text_lora_fused(self): # Fusing should still keep the LoRA layers self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder") - if self.has_two_text_encoders: + if self.has_two_text_encoders or self.has_three_text_encoders: self.assertTrue( check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2" ) @@ -297,7 +365,12 @@ def test_simple_inference_with_text_lora_unloaded(self): Tests a simple inference with lora attached to text encoder, then unloads the lora weights and makes sure it works as expected """ - for scheduler_cls in [DDIMScheduler, LCMScheduler]: + scheduler_classes = ( + [FlowMatchEulerDiscreteScheduler] + if self.has_three_text_encoders and self.transformer_kwargs + else [DDIMScheduler, LCMScheduler] + ) + for scheduler_cls in scheduler_classes: components, text_lora_config, _ = self.get_dummy_components(scheduler_cls) pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device) @@ -305,12 +378,13 @@ def test_simple_inference_with_text_lora_unloaded(self): _, _, inputs = self.get_dummy_inputs(with_generator=False) output_no_lora = pipe(**inputs, generator=torch.manual_seed(0)).images - self.assertTrue(output_no_lora.shape == (1, 64, 64, 3)) + shape_to_be_checked = (1, 64, 64, 3) if self.unet_kwargs is not None else (1, 32, 32, 3) + self.assertTrue(output_no_lora.shape == shape_to_be_checked) pipe.text_encoder.add_adapter(text_lora_config) self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder") - if self.has_two_text_encoders: + if self.has_two_text_encoders or self.has_three_text_encoders: pipe.text_encoder_2.add_adapter(text_lora_config) self.assertTrue( check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2" @@ -322,7 +396,7 @@ def test_simple_inference_with_text_lora_unloaded(self): check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly unloaded in text encoder" ) - if self.has_two_text_encoders: + if self.has_two_text_encoders or self.has_three_text_encoders: self.assertFalse( check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly unloaded in text encoder 2", @@ -338,7 +412,12 @@ def test_simple_inference_with_text_lora_save_load(self): """ Tests a simple usecase where users could use saving utilities for LoRA. 
""" - for scheduler_cls in [DDIMScheduler, LCMScheduler]: + scheduler_classes = ( + [FlowMatchEulerDiscreteScheduler] + if self.has_three_text_encoders and self.transformer_kwargs + else [DDIMScheduler, LCMScheduler] + ) + for scheduler_cls in scheduler_classes: components, text_lora_config, _ = self.get_dummy_components(scheduler_cls) pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device) @@ -346,12 +425,13 @@ def test_simple_inference_with_text_lora_save_load(self): _, _, inputs = self.get_dummy_inputs(with_generator=False) output_no_lora = pipe(**inputs, generator=torch.manual_seed(0)).images - self.assertTrue(output_no_lora.shape == (1, 64, 64, 3)) + shape_to_be_checked = (1, 64, 64, 3) if self.unet_kwargs is not None else (1, 32, 32, 3) + self.assertTrue(output_no_lora.shape == shape_to_be_checked) pipe.text_encoder.add_adapter(text_lora_config) self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder") - if self.has_two_text_encoders: + if self.has_two_text_encoders or self.has_three_text_encoders: pipe.text_encoder_2.add_adapter(text_lora_config) self.assertTrue( check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2" @@ -361,7 +441,7 @@ def test_simple_inference_with_text_lora_save_load(self): with tempfile.TemporaryDirectory() as tmpdirname: text_encoder_state_dict = get_peft_model_state_dict(pipe.text_encoder) - if self.has_two_text_encoders: + if self.has_two_text_encoders or self.has_three_text_encoders: text_encoder_2_state_dict = get_peft_model_state_dict(pipe.text_encoder_2) self.pipeline_class.save_lora_weights( @@ -385,7 +465,7 @@ def test_simple_inference_with_text_lora_save_load(self): images_lora_from_pretrained = pipe(**inputs, generator=torch.manual_seed(0)).images self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder") - if self.has_two_text_encoders: + if self.has_two_text_encoders or self.has_three_text_encoders: self.assertTrue( check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2" ) @@ -401,9 +481,14 @@ def test_simple_inference_with_partial_text_lora(self): with different ranks and some adapters removed and makes sure it works as expected """ - for scheduler_cls in [DDIMScheduler, LCMScheduler]: + scheduler_classes = ( + [FlowMatchEulerDiscreteScheduler] + if self.has_three_text_encoders and self.transformer_kwargs + else [DDIMScheduler, LCMScheduler] + ) + for scheduler_cls in scheduler_classes: components, _, _ = self.get_dummy_components(scheduler_cls) - # Verify `LoraLoaderMixin.load_lora_into_text_encoder` handles different ranks per module (PR#8324). + # Verify `StableDiffusionLoraLoaderMixin.load_lora_into_text_encoder` handles different ranks per module (PR#8324). 
text_lora_config = LoraConfig( r=4, rank_pattern={"q_proj": 1, "k_proj": 2, "v_proj": 3}, @@ -418,7 +503,8 @@ def test_simple_inference_with_partial_text_lora(self): _, _, inputs = self.get_dummy_inputs(with_generator=False) output_no_lora = pipe(**inputs, generator=torch.manual_seed(0)).images -            self.assertTrue(output_no_lora.shape == (1, 64, 64, 3)) +            shape_to_be_checked = (1, 64, 64, 3) if self.unet_kwargs is not None else (1, 32, 32, 3) +            self.assertTrue(output_no_lora.shape == shape_to_be_checked) pipe.text_encoder.add_adapter(text_lora_config) self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder") @@ -430,7 +516,7 @@ def test_simple_inference_with_partial_text_lora(self): if "text_model.encoder.layers.4" not in module_name } -            if self.has_two_text_encoders: +            if self.has_two_text_encoders or self.has_three_text_encoders: pipe.text_encoder_2.add_adapter(text_lora_config) self.assertTrue( check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2" @@ -462,7 +548,12 @@ def test_simple_inference_save_pretrained(self): """ Tests a simple usecase where users could use saving utilities for LoRA through save_pretrained """ -        for scheduler_cls in [DDIMScheduler, LCMScheduler]: +        scheduler_classes = ( +            [FlowMatchEulerDiscreteScheduler] +            if self.has_three_text_encoders and self.transformer_kwargs +            else [DDIMScheduler, LCMScheduler] +        ) +        for scheduler_cls in scheduler_classes: components, text_lora_config, _ = self.get_dummy_components(scheduler_cls) pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device) @@ -470,12 +561,13 @@ def test_simple_inference_save_pretrained(self): _, _, inputs = self.get_dummy_inputs(with_generator=False) output_no_lora = pipe(**inputs, generator=torch.manual_seed(0)).images -            self.assertTrue(output_no_lora.shape == (1, 64, 64, 3)) +            shape_to_be_checked = (1, 64, 64, 3) if self.unet_kwargs is not None else (1, 32, 32, 3) +            self.assertTrue(output_no_lora.shape == shape_to_be_checked) pipe.text_encoder.add_adapter(text_lora_config) self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder") -            if self.has_two_text_encoders: +            if self.has_two_text_encoders or self.has_three_text_encoders: pipe.text_encoder_2.add_adapter(text_lora_config) self.assertTrue( check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2" @@ -494,7 +586,7 @@ def test_simple_inference_save_pretrained(self): "Lora not correctly set in text encoder", ) -            if self.has_two_text_encoders: +            if self.has_two_text_encoders or self.has_three_text_encoders: self.assertTrue( check_if_lora_correctly_set(pipe_from_pretrained.text_encoder_2), "Lora not correctly set in text encoder 2", @@ -507,27 +599,42 @@ def test_simple_inference_save_pretrained(self): "Loading from saved checkpoints should give same results.", ) -    def test_simple_inference_with_text_unet_lora_save_load(self): +    def test_simple_inference_with_text_denoiser_lora_save_load(self): """ Tests a simple usecase where users could use saving utilities for LoRA for Unet + text encoder """ -        for scheduler_cls in [DDIMScheduler, LCMScheduler]: -            components, text_lora_config, unet_lora_config = self.get_dummy_components(scheduler_cls) +        scheduler_classes = ( +            [FlowMatchEulerDiscreteScheduler] +            if 
self.has_three_text_encoders and self.transformer_kwargs + else [DDIMScheduler, LCMScheduler] + ) + for scheduler_cls in scheduler_classes: + components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls) pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) _, _, inputs = self.get_dummy_inputs(with_generator=False) output_no_lora = pipe(**inputs, generator=torch.manual_seed(0)).images - self.assertTrue(output_no_lora.shape == (1, 64, 64, 3)) + shape_to_be_checked = (1, 64, 64, 3) if self.unet_kwargs is not None else (1, 32, 32, 3) + self.assertTrue(output_no_lora.shape == shape_to_be_checked) pipe.text_encoder.add_adapter(text_lora_config) - pipe.unet.add_adapter(unet_lora_config) + if self.unet_kwargs is not None: + pipe.unet.add_adapter(denoiser_lora_config) + else: + pipe.transformer.add_adapter(denoiser_lora_config) self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder") - self.assertTrue(check_if_lora_correctly_set(pipe.unet), "Lora not correctly set in Unet") + denoiser_to_checked = pipe.unet if self.unet_kwargs is not None else pipe.transformer + self.assertTrue(check_if_lora_correctly_set(denoiser_to_checked), "Lora not correctly set in Unet") - if self.has_two_text_encoders: + if self.has_two_text_encoders or self.has_three_text_encoders: pipe.text_encoder_2.add_adapter(text_lora_config) self.assertTrue( check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2" @@ -537,22 +644,36 @@ def test_simple_inference_with_text_unet_lora_save_load(self): with tempfile.TemporaryDirectory() as tmpdirname: text_encoder_state_dict = get_peft_model_state_dict(pipe.text_encoder) - unet_state_dict = get_peft_model_state_dict(pipe.unet) - if self.has_two_text_encoders: + + if self.unet_kwargs is not None: + denoiser_state_dict = get_peft_model_state_dict(pipe.unet) + else: + denoiser_state_dict = get_peft_model_state_dict(pipe.transformer) + + if self.has_two_text_encoders or self.has_three_text_encoders: text_encoder_2_state_dict = get_peft_model_state_dict(pipe.text_encoder_2) - self.pipeline_class.save_lora_weights( - save_directory=tmpdirname, - text_encoder_lora_layers=text_encoder_state_dict, - text_encoder_2_lora_layers=text_encoder_2_state_dict, - unet_lora_layers=unet_state_dict, - safe_serialization=False, - ) + if self.unet_kwargs is not None: + self.pipeline_class.save_lora_weights( + save_directory=tmpdirname, + text_encoder_lora_layers=text_encoder_state_dict, + text_encoder_2_lora_layers=text_encoder_2_state_dict, + unet_lora_layers=denoiser_state_dict, + safe_serialization=False, + ) + else: + self.pipeline_class.save_lora_weights( + save_directory=tmpdirname, + text_encoder_lora_layers=text_encoder_state_dict, + text_encoder_2_lora_layers=text_encoder_2_state_dict, + transformer_lora_layers=denoiser_state_dict, + safe_serialization=False, + ) else: self.pipeline_class.save_lora_weights( save_directory=tmpdirname, text_encoder_lora_layers=text_encoder_state_dict, - unet_lora_layers=unet_state_dict, + unet_lora_layers=denoiser_state_dict, safe_serialization=False, ) @@ -563,9 +684,10 @@ def test_simple_inference_with_text_unet_lora_save_load(self): images_lora_from_pretrained = pipe(**inputs, generator=torch.manual_seed(0)).images self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder") - self.assertTrue(check_if_lora_correctly_set(pipe.unet), "Lora not correctly 
set in Unet") + denoiser_to_checked = pipe.unet if self.unet_kwargs is not None else pipe.transformer + self.assertTrue(check_if_lora_correctly_set(denoiser_to_checked), "Lora not correctly set in denoiser") - if self.has_two_text_encoders: + if self.has_two_text_encoders or self.has_three_text_encoders: self.assertTrue( check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2" ) @@ -575,27 +697,37 @@ def test_simple_inference_with_text_unet_lora_save_load(self): "Loading from saved checkpoints should give same results.", ) - def test_simple_inference_with_text_unet_lora_and_scale(self): + def test_simple_inference_with_text_denoiser_lora_and_scale(self): """ Tests a simple inference with lora attached on the text encoder + Unet + scale argument and makes sure it works as expected """ - for scheduler_cls in [DDIMScheduler, LCMScheduler]: - components, text_lora_config, unet_lora_config = self.get_dummy_components(scheduler_cls) + scheduler_classes = ( + [FlowMatchEulerDiscreteScheduler] + if self.has_three_text_encoders and self.transformer_kwargs + else [DDIMScheduler, LCMScheduler] + ) + for scheduler_cls in scheduler_classes: + components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls) pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) _, _, inputs = self.get_dummy_inputs(with_generator=False) output_no_lora = pipe(**inputs, generator=torch.manual_seed(0)).images - self.assertTrue(output_no_lora.shape == (1, 64, 64, 3)) + shape_to_be_checked = (1, 64, 64, 3) if self.unet_kwargs is not None else (1, 32, 32, 3) + self.assertTrue(output_no_lora.shape == shape_to_be_checked) pipe.text_encoder.add_adapter(text_lora_config) - pipe.unet.add_adapter(unet_lora_config) + if self.unet_kwargs is not None: + pipe.unet.add_adapter(denoiser_lora_config) + else: + pipe.transformer.add_adapter(denoiser_lora_config) self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder") - self.assertTrue(check_if_lora_correctly_set(pipe.unet), "Lora not correctly set in Unet") + denoiser_to_checked = pipe.unet if self.unet_kwargs is not None else pipe.transformer + self.assertTrue(check_if_lora_correctly_set(denoiser_to_checked), "Lora not correctly set in denoiser") - if self.has_two_text_encoders: + if self.has_two_text_encoders or self.has_three_text_encoders: pipe.text_encoder_2.add_adapter(text_lora_config) self.assertTrue( check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2" @@ -606,17 +738,27 @@ def test_simple_inference_with_text_unet_lora_and_scale(self): not np.allclose(output_lora, output_no_lora, atol=1e-3, rtol=1e-3), "Lora should change the output" ) - output_lora_scale = pipe( - **inputs, generator=torch.manual_seed(0), cross_attention_kwargs={"scale": 0.5} - ).images + if self.unet_kwargs is not None: + output_lora_scale = pipe( + **inputs, generator=torch.manual_seed(0), cross_attention_kwargs={"scale": 0.5} + ).images + else: + output_lora_scale = pipe( + **inputs, generator=torch.manual_seed(0), joint_attention_kwargs={"scale": 0.5} + ).images self.assertTrue( not np.allclose(output_lora, output_lora_scale, atol=1e-3, rtol=1e-3), "Lora + scale should change the output", ) - output_lora_0_scale = pipe( - **inputs, generator=torch.manual_seed(0), cross_attention_kwargs={"scale": 0.0} - ).images + if self.unet_kwargs is not None: + output_lora_0_scale = pipe( + **inputs, 
generator=torch.manual_seed(0), cross_attention_kwargs={"scale": 0.0} + ).images + else: + output_lora_0_scale = pipe( + **inputs, generator=torch.manual_seed(0), joint_attention_kwargs={"scale": 0.0} + ).images self.assertTrue( np.allclose(output_no_lora, output_lora_0_scale, atol=1e-3, rtol=1e-3), "Lora + 0 scale should lead to same result as no LoRA", @@ -627,28 +769,38 @@ def test_simple_inference_with_text_unet_lora_and_scale(self): "The scaling parameter has not been correctly restored!", ) - def test_simple_inference_with_text_lora_unet_fused(self): + def test_simple_inference_with_text_lora_denoiser_fused(self): """ Tests a simple inference with lora attached into text encoder + fuses the lora weights into base model and makes sure it works as expected - with unet """ - for scheduler_cls in [DDIMScheduler, LCMScheduler]: - components, text_lora_config, unet_lora_config = self.get_dummy_components(scheduler_cls) + scheduler_classes = ( + [FlowMatchEulerDiscreteScheduler] + if self.has_three_text_encoders and self.transformer_kwargs + else [DDIMScheduler, LCMScheduler] + ) + for scheduler_cls in scheduler_classes: + components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls) pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) _, _, inputs = self.get_dummy_inputs(with_generator=False) output_no_lora = pipe(**inputs, generator=torch.manual_seed(0)).images - self.assertTrue(output_no_lora.shape == (1, 64, 64, 3)) + shape_to_be_checked = (1, 64, 64, 3) if self.unet_kwargs is not None else (1, 32, 32, 3) + self.assertTrue(output_no_lora.shape == shape_to_be_checked) pipe.text_encoder.add_adapter(text_lora_config) - pipe.unet.add_adapter(unet_lora_config) + if self.unet_kwargs is not None: + pipe.unet.add_adapter(denoiser_lora_config) + else: + pipe.transformer.add_adapter(denoiser_lora_config) self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder") - self.assertTrue(check_if_lora_correctly_set(pipe.unet), "Lora not correctly set in Unet") + denoiser_to_checked = pipe.unet if self.unet_kwargs is not None else pipe.transformer + self.assertTrue(check_if_lora_correctly_set(denoiser_to_checked), "Lora not correctly set in denoiser") - if self.has_two_text_encoders: + if self.has_two_text_encoders or self.has_three_text_encoders: pipe.text_encoder_2.add_adapter(text_lora_config) self.assertTrue( check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2" @@ -657,9 +809,10 @@ def test_simple_inference_with_text_lora_unet_fused(self): pipe.fuse_lora() # Fusing should still keep the LoRA layers self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder") - self.assertTrue(check_if_lora_correctly_set(pipe.unet), "Lora not correctly set in unet") + denoiser_to_checked = pipe.unet if self.unet_kwargs is not None else pipe.transformer + self.assertTrue(check_if_lora_correctly_set(denoiser_to_checked), "Lora not correctly set in denoiser") - if self.has_two_text_encoders: + if self.has_two_text_encoders or self.has_three_text_encoders: self.assertTrue( check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2" ) @@ -669,27 +822,37 @@ def test_simple_inference_with_text_lora_unet_fused(self): np.allclose(ouput_fused, output_no_lora, atol=1e-3, rtol=1e-3), "Fused lora should change the output" ) - def 
test_simple_inference_with_text_unet_lora_unloaded(self): + def test_simple_inference_with_text_denoiser_lora_unloaded(self): """ Tests a simple inference with lora attached to text encoder and unet, then unloads the lora weights and makes sure it works as expected """ - for scheduler_cls in [DDIMScheduler, LCMScheduler]: - components, text_lora_config, unet_lora_config = self.get_dummy_components(scheduler_cls) + scheduler_classes = ( + [FlowMatchEulerDiscreteScheduler] + if self.has_three_text_encoders and self.transformer_kwargs + else [DDIMScheduler, LCMScheduler] + ) + for scheduler_cls in scheduler_classes: + components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls) pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) _, _, inputs = self.get_dummy_inputs(with_generator=False) output_no_lora = pipe(**inputs, generator=torch.manual_seed(0)).images - self.assertTrue(output_no_lora.shape == (1, 64, 64, 3)) + shape_to_be_checked = (1, 64, 64, 3) if self.unet_kwargs is not None else (1, 32, 32, 3) + self.assertTrue(output_no_lora.shape == shape_to_be_checked) pipe.text_encoder.add_adapter(text_lora_config) - pipe.unet.add_adapter(unet_lora_config) + if self.unet_kwargs is not None: + pipe.unet.add_adapter(denoiser_lora_config) + else: + pipe.transformer.add_adapter(denoiser_lora_config) self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder") - self.assertTrue(check_if_lora_correctly_set(pipe.unet), "Lora not correctly set in Unet") + denoiser_to_checked = pipe.unet if self.unet_kwargs is not None else pipe.transformer + self.assertTrue(check_if_lora_correctly_set(denoiser_to_checked), "Lora not correctly set in denoiser") - if self.has_two_text_encoders: + if self.has_two_text_encoders or self.has_three_text_encoders: pipe.text_encoder_2.add_adapter(text_lora_config) self.assertTrue( check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2" @@ -700,9 +863,12 @@ def test_simple_inference_with_text_unet_lora_unloaded(self): self.assertFalse( check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly unloaded in text encoder" ) - self.assertFalse(check_if_lora_correctly_set(pipe.unet), "Lora not correctly unloaded in Unet") + denoiser_to_checked = pipe.unet if self.unet_kwargs is not None else pipe.transformer + self.assertFalse( + check_if_lora_correctly_set(denoiser_to_checked), "Lora not correctly unloaded in denoiser" + ) - if self.has_two_text_encoders: + if self.has_two_text_encoders or self.has_three_text_encoders: self.assertFalse( check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly unloaded in text encoder 2", @@ -714,25 +880,34 @@ def test_simple_inference_with_text_unet_lora_unloaded(self): "Fused lora should change the output", ) - def test_simple_inference_with_text_unet_lora_unfused(self): + def test_simple_inference_with_text_denoiser_lora_unfused(self): """ Tests a simple inference with lora attached to text encoder and unet, then unloads the lora weights and makes sure it works as expected """ - for scheduler_cls in [DDIMScheduler, LCMScheduler]: - components, text_lora_config, unet_lora_config = self.get_dummy_components(scheduler_cls) + scheduler_classes = ( + [FlowMatchEulerDiscreteScheduler] + if self.has_three_text_encoders and self.transformer_kwargs + else [DDIMScheduler, LCMScheduler] + ) + for scheduler_cls in scheduler_classes: + components, text_lora_config, 
denoiser_lora_config = self.get_dummy_components(scheduler_cls) pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) _, _, inputs = self.get_dummy_inputs(with_generator=False) pipe.text_encoder.add_adapter(text_lora_config) - pipe.unet.add_adapter(unet_lora_config) + if self.unet_kwargs is not None: + pipe.unet.add_adapter(denoiser_lora_config) + else: + pipe.transformer.add_adapter(denoiser_lora_config) self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder") - self.assertTrue(check_if_lora_correctly_set(pipe.unet), "Lora not correctly set in Unet") + denoiser_to_checked = pipe.unet if self.unet_kwargs is not None else pipe.transformer + self.assertTrue(check_if_lora_correctly_set(denoiser_to_checked), "Lora not correctly set in denoiser") - if self.has_two_text_encoders: + if self.has_two_text_encoders or self.has_three_text_encoders: pipe.text_encoder_2.add_adapter(text_lora_config) self.assertTrue( check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2" @@ -747,9 +922,10 @@ def test_simple_inference_with_text_unet_lora_unfused(self): output_unfused_lora = pipe(**inputs, generator=torch.manual_seed(0)).images # unloading should remove the LoRA layers self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Unfuse should still keep LoRA layers") - self.assertTrue(check_if_lora_correctly_set(pipe.unet), "Unfuse should still keep LoRA layers") + denoiser_to_checked = pipe.unet if self.unet_kwargs is not None else pipe.transformer + self.assertTrue(check_if_lora_correctly_set(denoiser_to_checked), "Unfuse should still keep LoRA layers") - if self.has_two_text_encoders: + if self.has_two_text_encoders or self.has_three_text_encoders: self.assertTrue( check_if_lora_correctly_set(pipe.text_encoder_2), "Unfuse should still keep LoRA layers" ) @@ -760,13 +936,18 @@ def test_simple_inference_with_text_unet_lora_unfused(self): "Fused lora should change the output", ) - def test_simple_inference_with_text_unet_multi_adapter(self): + def test_simple_inference_with_text_denoiser_multi_adapter(self): """ Tests a simple inference with lora attached to text encoder and unet, attaches multiple adapters and set them """ - for scheduler_cls in [DDIMScheduler, LCMScheduler]: - components, text_lora_config, unet_lora_config = self.get_dummy_components(scheduler_cls) + scheduler_classes = ( + [FlowMatchEulerDiscreteScheduler] + if self.has_three_text_encoders and self.transformer_kwargs + else [DDIMScheduler, LCMScheduler] + ) + for scheduler_cls in scheduler_classes: + components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls) pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) @@ -777,13 +958,20 @@ def test_simple_inference_with_text_unet_multi_adapter(self): pipe.text_encoder.add_adapter(text_lora_config, "adapter-1") pipe.text_encoder.add_adapter(text_lora_config, "adapter-2") - pipe.unet.add_adapter(unet_lora_config, "adapter-1") - pipe.unet.add_adapter(unet_lora_config, "adapter-2") + if self.unet_kwargs is not None: + pipe.unet.add_adapter(denoiser_lora_config, "adapter-1") + else: + pipe.transformer.add_adapter(denoiser_lora_config, "adapter-1") + if self.unet_kwargs is not None: + pipe.unet.add_adapter(denoiser_lora_config, "adapter-2") + else: + pipe.transformer.add_adapter(denoiser_lora_config, "adapter-2") 
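
The adapter bookkeeping asserted around here behaves roughly as follows in user code. A compact sketch for a UNet-based pipeline, assuming the configs are ordinary PEFT `LoraConfig` objects like the ones built in `get_dummy_components`; for the SD3 path one would swap `pipe.unet` for `pipe.transformer` and expect a "transformer" key instead of "unet":

def exercise_adapter_bookkeeping(pipe, text_lora_config, denoiser_lora_config):
    pipe.text_encoder.add_adapter(text_lora_config, "adapter-1")
    pipe.unet.add_adapter(denoiser_lora_config, "adapter-1")
    assert pipe.get_active_adapters() == ["adapter-1"]

    pipe.text_encoder.add_adapter(text_lora_config, "adapter-2")
    pipe.unet.add_adapter(denoiser_lora_config, "adapter-2")
    # The most recently added adapter becomes the active one.
    assert pipe.get_active_adapters() == ["adapter-2"]
    assert pipe.get_list_adapters() == {
        "text_encoder": ["adapter-1", "adapter-2"],
        "unet": ["adapter-1", "adapter-2"],
    }

    # Activate both adapters with per-adapter weights, then remove them entirely.
    pipe.set_adapters(["adapter-1", "adapter-2"], [0.5, 1.0])
    pipe.delete_adapters(["adapter-1", "adapter-2"])
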
self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder") - self.assertTrue(check_if_lora_correctly_set(pipe.unet), "Lora not correctly set in Unet") + denoiser_to_checked = pipe.unet if self.unet_kwargs is not None else pipe.transformer + self.assertTrue(check_if_lora_correctly_set(denoiser_to_checked), "Lora not correctly set in denoiser") - if self.has_two_text_encoders: + if self.has_two_text_encoders or self.has_three_text_encoders: pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-1") pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-2") self.assertTrue( @@ -826,13 +1014,21 @@ def test_simple_inference_with_text_unet_multi_adapter(self): "output with no lora and output with lora disabled should give same results", ) - def test_simple_inference_with_text_unet_block_scale(self): + def test_simple_inference_with_text_denoiser_block_scale(self): """ Tests a simple inference with lora attached to text encoder and unet, attaches one adapter and set differnt weights for different blocks (i.e. block lora) """ - for scheduler_cls in [DDIMScheduler, LCMScheduler]: - components, text_lora_config, unet_lora_config = self.get_dummy_components(scheduler_cls) + if self.pipeline_class.__name__ == "StableDiffusion3Pipeline": + return + + scheduler_classes = ( + [FlowMatchEulerDiscreteScheduler] + if self.has_three_text_encoders and self.transformer_kwargs + else [DDIMScheduler, LCMScheduler] + ) + for scheduler_cls in scheduler_classes: + components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls) pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) @@ -841,12 +1037,16 @@ def test_simple_inference_with_text_unet_block_scale(self): output_no_lora = pipe(**inputs, generator=torch.manual_seed(0)).images pipe.text_encoder.add_adapter(text_lora_config, "adapter-1") - pipe.unet.add_adapter(unet_lora_config, "adapter-1") + if self.unet_kwargs is not None: + pipe.unet.add_adapter(denoiser_lora_config, "adapter-1") + else: + pipe.transformer.add_adapter(denoiser_lora_config, "adapter-1") self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder") - self.assertTrue(check_if_lora_correctly_set(pipe.unet), "Lora not correctly set in Unet") + denoiser_to_checked = pipe.unet if self.unet_kwargs is not None else pipe.transformer + self.assertTrue(check_if_lora_correctly_set(denoiser_to_checked), "Lora not correctly set in denoiser") - if self.has_two_text_encoders: + if self.has_two_text_encoders or self.has_three_text_encoders: pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-1") self.assertTrue( check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2" @@ -881,13 +1081,21 @@ def test_simple_inference_with_text_unet_block_scale(self): "output with no lora and output with lora disabled should give same results", ) - def test_simple_inference_with_text_unet_multi_adapter_block_lora(self): + def test_simple_inference_with_text_denoiser_multi_adapter_block_lora(self): """ Tests a simple inference with lora attached to text encoder and unet, attaches multiple adapters and set differnt weights for different blocks (i.e. 
block lora) """ - for scheduler_cls in [DDIMScheduler, LCMScheduler]: - components, text_lora_config, unet_lora_config = self.get_dummy_components(scheduler_cls) + if self.pipeline_class.__name__ == "StableDiffusion3Pipeline": + return + + scheduler_classes = ( + [FlowMatchEulerDiscreteScheduler] + if self.has_three_text_encoders and self.transformer_kwargs + else [DDIMScheduler, LCMScheduler] + ) + for scheduler_cls in scheduler_classes: + components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls) pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) @@ -898,13 +1106,20 @@ def test_simple_inference_with_text_unet_multi_adapter_block_lora(self): pipe.text_encoder.add_adapter(text_lora_config, "adapter-1") pipe.text_encoder.add_adapter(text_lora_config, "adapter-2") - pipe.unet.add_adapter(unet_lora_config, "adapter-1") - pipe.unet.add_adapter(unet_lora_config, "adapter-2") + if self.unet_kwargs is not None: + pipe.unet.add_adapter(denoiser_lora_config, "adapter-1") + else: + pipe.transformer.add_adapter(denoiser_lora_config, "adapter-1") + if self.unet_kwargs is not None: + pipe.unet.add_adapter(denoiser_lora_config, "adapter-2") + else: + pipe.transformer.add_adapter(denoiser_lora_config, "adapter-2") self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder") - self.assertTrue(check_if_lora_correctly_set(pipe.unet), "Lora not correctly set in Unet") + denoiser_to_checked = pipe.unet if self.unet_kwargs is not None else pipe.transformer + self.assertTrue(check_if_lora_correctly_set(denoiser_to_checked), "Lora not correctly set in denoiser") - if self.has_two_text_encoders: + if self.has_two_text_encoders or self.has_three_text_encoders: pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-1") pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-2") self.assertTrue( @@ -953,8 +1168,10 @@ def test_simple_inference_with_text_unet_multi_adapter_block_lora(self): with self.assertRaises(ValueError): pipe.set_adapters(["adapter-1", "adapter-2"], [scales_1]) - def test_simple_inference_with_text_unet_block_scale_for_all_dict_options(self): + def test_simple_inference_with_text_denoiser_block_scale_for_all_dict_options(self): """Tests that any valid combination of lora block scales can be used in pipe.set_adapter""" + if self.pipeline_class.__name__ == "StableDiffusion3Pipeline": + return def updown_options(blocks_with_tf, layers_per_block, value): """ @@ -1019,16 +1236,19 @@ def all_possible_dict_opts(unet, value): return opts - components, text_lora_config, unet_lora_config = self.get_dummy_components(self.scheduler_cls) + components, text_lora_config, denoiser_lora_config = self.get_dummy_components(self.scheduler_cls) pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) _, _, inputs = self.get_dummy_inputs(with_generator=False) pipe.text_encoder.add_adapter(text_lora_config, "adapter-1") - pipe.unet.add_adapter(unet_lora_config, "adapter-1") + if self.unet_kwargs is not None: + pipe.unet.add_adapter(denoiser_lora_config, "adapter-1") + else: + pipe.transformer.add_adapter(denoiser_lora_config, "adapter-1") - if self.has_two_text_encoders: + if self.has_two_text_encoders or self.has_three_text_encoders: pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-1") for scale_dict in all_possible_dict_opts(pipe.unet, value=1234): @@ -1038,13 +1258,18 @@ def all_possible_dict_opts(unet, 
value): pipe.set_adapters("adapter-1", scale_dict) # test will fail if this line throws an error - def test_simple_inference_with_text_unet_multi_adapter_delete_adapter(self): + def test_simple_inference_with_text_denoiser_multi_adapter_delete_adapter(self): """ Tests a simple inference with lora attached to text encoder and unet, attaches multiple adapters and set/delete them """ - for scheduler_cls in [DDIMScheduler, LCMScheduler]: - components, text_lora_config, unet_lora_config = self.get_dummy_components(scheduler_cls) + scheduler_classes = ( + [FlowMatchEulerDiscreteScheduler] + if self.has_three_text_encoders and self.transformer_kwargs + else [DDIMScheduler, LCMScheduler] + ) + for scheduler_cls in scheduler_classes: + components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls) pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) @@ -1055,13 +1280,20 @@ def test_simple_inference_with_text_unet_multi_adapter_delete_adapter(self): pipe.text_encoder.add_adapter(text_lora_config, "adapter-1") pipe.text_encoder.add_adapter(text_lora_config, "adapter-2") - pipe.unet.add_adapter(unet_lora_config, "adapter-1") - pipe.unet.add_adapter(unet_lora_config, "adapter-2") + if self.unet_kwargs is not None: + pipe.unet.add_adapter(denoiser_lora_config, "adapter-1") + else: + pipe.transformer.add_adapter(denoiser_lora_config, "adapter-1") + if self.unet_kwargs is not None: + pipe.unet.add_adapter(denoiser_lora_config, "adapter-2") + else: + pipe.transformer.add_adapter(denoiser_lora_config, "adapter-2") self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder") - self.assertTrue(check_if_lora_correctly_set(pipe.unet), "Lora not correctly set in Unet") + denoiser_to_checked = pipe.unet if self.unet_kwargs is not None else pipe.transformer + self.assertTrue(check_if_lora_correctly_set(denoiser_to_checked), "Lora not correctly set in denoiser") - if self.has_two_text_encoders: + if self.has_two_text_encoders or self.has_three_text_encoders: pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-1") pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-2") self.assertTrue( @@ -1113,8 +1345,14 @@ def test_simple_inference_with_text_unet_multi_adapter_delete_adapter(self): pipe.text_encoder.add_adapter(text_lora_config, "adapter-1") pipe.text_encoder.add_adapter(text_lora_config, "adapter-2") - pipe.unet.add_adapter(unet_lora_config, "adapter-1") - pipe.unet.add_adapter(unet_lora_config, "adapter-2") + if self.unet_kwargs is not None: + pipe.unet.add_adapter(denoiser_lora_config, "adapter-1") + else: + pipe.transformer.add_adapter(denoiser_lora_config, "adapter-1") + if self.unet_kwargs is not None: + pipe.unet.add_adapter(denoiser_lora_config, "adapter-2") + else: + pipe.transformer.add_adapter(denoiser_lora_config, "adapter-2") pipe.set_adapters(["adapter-1", "adapter-2"]) pipe.delete_adapters(["adapter-1", "adapter-2"]) @@ -1126,13 +1364,18 @@ def test_simple_inference_with_text_unet_multi_adapter_delete_adapter(self): "output with no lora and output with lora disabled should give same results", ) - def test_simple_inference_with_text_unet_multi_adapter_weighted(self): + def test_simple_inference_with_text_denoiser_multi_adapter_weighted(self): """ Tests a simple inference with lora attached to text encoder and unet, attaches multiple adapters and set them """ - for scheduler_cls in [DDIMScheduler, LCMScheduler]: - components, text_lora_config, 
unet_lora_config = self.get_dummy_components(scheduler_cls) + scheduler_classes = ( + [FlowMatchEulerDiscreteScheduler] + if self.has_three_text_encoders and self.transformer_kwargs + else [DDIMScheduler, LCMScheduler] + ) + for scheduler_cls in scheduler_classes: + components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls) pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) @@ -1143,13 +1386,20 @@ def test_simple_inference_with_text_unet_multi_adapter_weighted(self): pipe.text_encoder.add_adapter(text_lora_config, "adapter-1") pipe.text_encoder.add_adapter(text_lora_config, "adapter-2") - pipe.unet.add_adapter(unet_lora_config, "adapter-1") - pipe.unet.add_adapter(unet_lora_config, "adapter-2") + if self.unet_kwargs is not None: + pipe.unet.add_adapter(denoiser_lora_config, "adapter-1") + else: + pipe.transformer.add_adapter(denoiser_lora_config, "adapter-1") + if self.unet_kwargs is not None: + pipe.unet.add_adapter(denoiser_lora_config, "adapter-2") + else: + pipe.transformer.add_adapter(denoiser_lora_config, "adapter-2") self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder") - self.assertTrue(check_if_lora_correctly_set(pipe.unet), "Lora not correctly set in Unet") + denoiser_to_checked = pipe.unet if self.unet_kwargs is not None else pipe.transformer + self.assertTrue(check_if_lora_correctly_set(denoiser_to_checked), "Lora not correctly set in denoiser") - if self.has_two_text_encoders: + if self.has_two_text_encoders or self.has_three_text_encoders: pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-1") pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-2") self.assertTrue( @@ -1202,8 +1452,13 @@ def test_simple_inference_with_text_unet_multi_adapter_weighted(self): @skip_mps def test_lora_fuse_nan(self): - for scheduler_cls in [DDIMScheduler, LCMScheduler]: - components, text_lora_config, unet_lora_config = self.get_dummy_components(scheduler_cls) + scheduler_classes = ( + [FlowMatchEulerDiscreteScheduler] + if self.has_three_text_encoders and self.transformer_kwargs + else [DDIMScheduler, LCMScheduler] + ) + for scheduler_cls in scheduler_classes: + components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls) pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) @@ -1211,16 +1466,23 @@ def test_lora_fuse_nan(self): pipe.text_encoder.add_adapter(text_lora_config, "adapter-1") - pipe.unet.add_adapter(unet_lora_config, "adapter-1") + if self.unet_kwargs is not None: + pipe.unet.add_adapter(denoiser_lora_config, "adapter-1") + else: + pipe.transformer.add_adapter(denoiser_lora_config, "adapter-1") self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder") - self.assertTrue(check_if_lora_correctly_set(pipe.unet), "Lora not correctly set in Unet") + denoiser_to_checked = pipe.unet if self.unet_kwargs is not None else pipe.transformer + self.assertTrue(check_if_lora_correctly_set(denoiser_to_checked), "Lora not correctly set in denoiser") # corrupt one LoRA weight with `inf` values with torch.no_grad(): - pipe.unet.mid_block.attentions[0].transformer_blocks[0].attn1.to_q.lora_A["adapter-1"].weight += float( - "inf" - ) + if self.unet_kwargs: + pipe.unet.mid_block.attentions[0].transformer_blocks[0].attn1.to_q.lora_A[ + "adapter-1" + ].weight += float("inf") + else: + 
pipe.transformer.transformer_blocks[0].attn.to_q.lora_A["adapter-1"].weight += float("inf") # with `safe_fusing=True` we should see an Error with self.assertRaises(ValueError): @@ -1238,21 +1500,32 @@ def test_get_adapters(self): Tests a simple usecase where we attach multiple adapters and check if the results are the expected results """ - for scheduler_cls in [DDIMScheduler, LCMScheduler]: - components, text_lora_config, unet_lora_config = self.get_dummy_components(scheduler_cls) + scheduler_classes = ( + [FlowMatchEulerDiscreteScheduler] + if self.has_three_text_encoders and self.transformer_kwargs + else [DDIMScheduler, LCMScheduler] + ) + for scheduler_cls in scheduler_classes: + components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls) pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) _, _, inputs = self.get_dummy_inputs(with_generator=False) pipe.text_encoder.add_adapter(text_lora_config, "adapter-1") - pipe.unet.add_adapter(unet_lora_config, "adapter-1") + if self.unet_kwargs is not None: + pipe.unet.add_adapter(denoiser_lora_config, "adapter-1") + else: + pipe.transformer.add_adapter(denoiser_lora_config, "adapter-1") adapter_names = pipe.get_active_adapters() self.assertListEqual(adapter_names, ["adapter-1"]) pipe.text_encoder.add_adapter(text_lora_config, "adapter-2") - pipe.unet.add_adapter(unet_lora_config, "adapter-2") + if self.unet_kwargs is not None: + pipe.unet.add_adapter(denoiser_lora_config, "adapter-2") + else: + pipe.transformer.add_adapter(denoiser_lora_config, "adapter-2") adapter_names = pipe.get_active_adapters() self.assertListEqual(adapter_names, ["adapter-2"]) @@ -1265,65 +1538,108 @@ def test_get_list_adapters(self): Tests a simple usecase where we attach multiple adapters and check if the results are the expected results """ - for scheduler_cls in [DDIMScheduler, LCMScheduler]: - components, text_lora_config, unet_lora_config = self.get_dummy_components(scheduler_cls) + scheduler_classes = ( + [FlowMatchEulerDiscreteScheduler] + if self.has_three_text_encoders and self.transformer_kwargs + else [DDIMScheduler, LCMScheduler] + ) + for scheduler_cls in scheduler_classes: + components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls) pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) pipe.text_encoder.add_adapter(text_lora_config, "adapter-1") - pipe.unet.add_adapter(unet_lora_config, "adapter-1") + if self.unet_kwargs is not None: + pipe.unet.add_adapter(denoiser_lora_config, "adapter-1") + else: + pipe.transformer.add_adapter(denoiser_lora_config, "adapter-1") adapter_names = pipe.get_list_adapters() - self.assertDictEqual(adapter_names, {"text_encoder": ["adapter-1"], "unet": ["adapter-1"]}) + dicts_to_be_checked = {"text_encoder": ["adapter-1"]} + if self.unet_kwargs is not None: + dicts_to_be_checked.update({"unet": ["adapter-1"]}) + else: + dicts_to_be_checked.update({"transformer": ["adapter-1"]}) + self.assertDictEqual(adapter_names, dicts_to_be_checked) pipe.text_encoder.add_adapter(text_lora_config, "adapter-2") - pipe.unet.add_adapter(unet_lora_config, "adapter-2") + if self.unet_kwargs is not None: + pipe.unet.add_adapter(denoiser_lora_config, "adapter-2") + else: + pipe.transformer.add_adapter(denoiser_lora_config, "adapter-2") adapter_names = pipe.get_list_adapters() - self.assertDictEqual( - adapter_names, {"text_encoder": ["adapter-1", "adapter-2"], 
"unet": ["adapter-1", "adapter-2"]} - ) + dicts_to_be_checked = {"text_encoder": ["adapter-1", "adapter-2"]} + if self.unet_kwargs is not None: + dicts_to_be_checked.update({"unet": ["adapter-1", "adapter-2"]}) + else: + dicts_to_be_checked.update({"transformer": ["adapter-1", "adapter-2"]}) + self.assertDictEqual(adapter_names, dicts_to_be_checked) pipe.set_adapters(["adapter-1", "adapter-2"]) + dicts_to_be_checked = {"text_encoder": ["adapter-1", "adapter-2"]} + if self.unet_kwargs is not None: + dicts_to_be_checked.update({"unet": ["adapter-1", "adapter-2"]}) + else: + dicts_to_be_checked.update({"transformer": ["adapter-1", "adapter-2"]}) self.assertDictEqual( pipe.get_list_adapters(), - {"unet": ["adapter-1", "adapter-2"], "text_encoder": ["adapter-1", "adapter-2"]}, + dicts_to_be_checked, ) - pipe.unet.add_adapter(unet_lora_config, "adapter-3") - self.assertDictEqual( - pipe.get_list_adapters(), - {"unet": ["adapter-1", "adapter-2", "adapter-3"], "text_encoder": ["adapter-1", "adapter-2"]}, - ) + if self.unet_kwargs is not None: + pipe.unet.add_adapter(denoiser_lora_config, "adapter-3") + else: + pipe.transformer.add_adapter(denoiser_lora_config, "adapter-3") + + dicts_to_be_checked = {"text_encoder": ["adapter-1", "adapter-2"]} + if self.unet_kwargs is not None: + dicts_to_be_checked.update({"unet": ["adapter-1", "adapter-2", "adapter-3"]}) + else: + dicts_to_be_checked.update({"transformer": ["adapter-1", "adapter-2", "adapter-3"]}) + self.assertDictEqual(pipe.get_list_adapters(), dicts_to_be_checked) @require_peft_version_greater(peft_version="0.6.2") - def test_simple_inference_with_text_lora_unet_fused_multi(self): + def test_simple_inference_with_text_lora_denoiser_fused_multi(self): """ Tests a simple inference with lora attached into text encoder + fuses the lora weights into base model and makes sure it works as expected - with unet and multi-adapter case """ - for scheduler_cls in [DDIMScheduler, LCMScheduler]: - components, text_lora_config, unet_lora_config = self.get_dummy_components(scheduler_cls) + scheduler_classes = ( + [FlowMatchEulerDiscreteScheduler] + if self.has_three_text_encoders and self.transformer_kwargs + else [DDIMScheduler, LCMScheduler] + ) + for scheduler_cls in scheduler_classes: + components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls) pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) _, _, inputs = self.get_dummy_inputs(with_generator=False) output_no_lora = pipe(**inputs, generator=torch.manual_seed(0)).images - self.assertTrue(output_no_lora.shape == (1, 64, 64, 3)) + shape_to_be_checked = (1, 64, 64, 3) if self.unet_kwargs is not None else (1, 32, 32, 3) + self.assertTrue(output_no_lora.shape == shape_to_be_checked) pipe.text_encoder.add_adapter(text_lora_config, "adapter-1") - pipe.unet.add_adapter(unet_lora_config, "adapter-1") + if self.unet_kwargs is not None: + pipe.unet.add_adapter(denoiser_lora_config, "adapter-1") + else: + pipe.transformer.add_adapter(denoiser_lora_config, "adapter-1") # Attach a second adapter pipe.text_encoder.add_adapter(text_lora_config, "adapter-2") - pipe.unet.add_adapter(unet_lora_config, "adapter-2") + if self.unet_kwargs is not None: + pipe.unet.add_adapter(denoiser_lora_config, "adapter-2") + else: + pipe.transformer.add_adapter(denoiser_lora_config, "adapter-2") self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder") - 
self.assertTrue(check_if_lora_correctly_set(pipe.unet), "Lora not correctly set in Unet") + denoiser_to_checked = pipe.unet if self.unet_kwargs is not None else pipe.transformer + self.assertTrue(check_if_lora_correctly_set(denoiser_to_checked), "Lora not correctly set in denoiser") - if self.has_two_text_encoders: + if self.has_two_text_encoders or self.has_three_text_encoders: pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-1") pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-2") self.assertTrue( @@ -1359,23 +1675,35 @@ def test_simple_inference_with_text_lora_unet_fused_multi(self): @require_peft_version_greater(peft_version="0.9.0") def test_simple_inference_with_dora(self): - for scheduler_cls in [DDIMScheduler, LCMScheduler]: - components, text_lora_config, unet_lora_config = self.get_dummy_components(scheduler_cls, use_dora=True) + scheduler_classes = ( + [FlowMatchEulerDiscreteScheduler] + if self.has_three_text_encoders and self.transformer_kwargs + else [DDIMScheduler, LCMScheduler] + ) + for scheduler_cls in scheduler_classes: + components, text_lora_config, denoiser_lora_config = self.get_dummy_components( + scheduler_cls, use_dora=True + ) pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) _, _, inputs = self.get_dummy_inputs(with_generator=False) output_no_dora_lora = pipe(**inputs, generator=torch.manual_seed(0)).images - self.assertTrue(output_no_dora_lora.shape == (1, 64, 64, 3)) + shape_to_be_checked = (1, 64, 64, 3) if self.unet_kwargs is not None else (1, 32, 32, 3) + self.assertTrue(output_no_dora_lora.shape == shape_to_be_checked) pipe.text_encoder.add_adapter(text_lora_config) - pipe.unet.add_adapter(unet_lora_config) + if self.unet_kwargs is not None: + pipe.unet.add_adapter(denoiser_lora_config) + else: + pipe.transformer.add_adapter(denoiser_lora_config) self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder") - self.assertTrue(check_if_lora_correctly_set(pipe.unet), "Lora not correctly set in Unet") + denoiser_to_checked = pipe.unet if self.unet_kwargs is not None else pipe.transformer + self.assertTrue(check_if_lora_correctly_set(denoiser_to_checked), "Lora not correctly set in denoiser") - if self.has_two_text_encoders: + if self.has_two_text_encoders or self.has_three_text_encoders: pipe.text_encoder_2.add_adapter(text_lora_config) self.assertTrue( check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2" @@ -1389,25 +1717,34 @@ def test_simple_inference_with_dora(self): ) @unittest.skip("This is failing for now - need to investigate") - def test_simple_inference_with_text_unet_lora_unfused_torch_compile(self): + def test_simple_inference_with_text_denoiser_lora_unfused_torch_compile(self): """ Tests a simple inference with lora attached to text encoder and unet, then unloads the lora weights and makes sure it works as expected """ - for scheduler_cls in [DDIMScheduler, LCMScheduler]: - components, text_lora_config, unet_lora_config = self.get_dummy_components(scheduler_cls) + scheduler_classes = ( + [FlowMatchEulerDiscreteScheduler] + if self.has_three_text_encoders and self.transformer_kwargs + else [DDIMScheduler, LCMScheduler] + ) + for scheduler_cls in scheduler_classes: + components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls) pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) _, _, inputs = 
self.get_dummy_inputs(with_generator=False) pipe.text_encoder.add_adapter(text_lora_config) - pipe.unet.add_adapter(unet_lora_config) + if self.unet_kwargs is not None: + pipe.unet.add_adapter(denoiser_lora_config) + else: + pipe.transformer.add_adapter(denoiser_lora_config) self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder") - self.assertTrue(check_if_lora_correctly_set(pipe.unet), "Lora not correctly set in Unet") + denoiser_to_checked = pipe.unet if self.unet_kwargs is not None else pipe.transformer + self.assertTrue(check_if_lora_correctly_set(denoiser_to_checked), "Lora not correctly set in denoiser") - if self.has_two_text_encoders: + if self.has_two_text_encoders or self.has_three_text_encoders: pipe.text_encoder_2.add_adapter(text_lora_config) self.assertTrue( check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2" @@ -1416,19 +1753,27 @@ def test_simple_inference_with_text_unet_lora_unfused_torch_compile(self): pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) pipe.text_encoder = torch.compile(pipe.text_encoder, mode="reduce-overhead", fullgraph=True) - if self.has_two_text_encoders: + if self.has_two_text_encoders or self.has_three_text_encoders: pipe.text_encoder_2 = torch.compile(pipe.text_encoder_2, mode="reduce-overhead", fullgraph=True) # Just makes sure it works.. _ = pipe(**inputs, generator=torch.manual_seed(0)).images def test_modify_padding_mode(self): + if self.pipeline_class.__name__ == "StableDiffusion3Pipeline": + return + def set_pad_mode(network, mode="circular"): for _, module in network.named_modules(): if isinstance(module, torch.nn.Conv2d): module.padding_mode = mode - for scheduler_cls in [DDIMScheduler, LCMScheduler]: + scheduler_classes = ( + [FlowMatchEulerDiscreteScheduler] + if self.has_three_text_encoders and self.transformer_kwargs + else [DDIMScheduler, LCMScheduler] + ) + for scheduler_cls in scheduler_classes: components, _, _ = self.get_dummy_components(scheduler_cls) pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device) diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py index 87ed1d9d17e5..64722e2d9797 100644 --- a/tests/models/test_modeling_common.py +++ b/tests/models/test_modeling_common.py @@ -40,6 +40,7 @@ ) from diffusers.training_utils import EMAModel from diffusers.utils import SAFE_WEIGHTS_INDEX_NAME, is_torch_npu_available, is_xformers_available, logging +from diffusers.utils.hub_utils import _add_variant from diffusers.utils.testing_utils import ( CaptureLogger, get_python_version, @@ -123,11 +124,9 @@ def test_cached_files_are_used_when_no_internet(self): if p1.data.ne(p2.data).sum() > 0: assert False, "Parameters not the same!" + @unittest.skip("Flaky behaviour on CI. Re-enable after migrating to new runners") + @unittest.skipIf(torch_device == "mps", reason="Test not supported for MPS.") def test_one_request_upon_cached(self): - # TODO: For some reason this test fails on MPS where no HEAD call is made. 
- if torch_device == "mps": - return - use_safetensors = False with tempfile.TemporaryDirectory() as tmpdirname: @@ -915,6 +914,43 @@ def test_sharded_checkpoints(self): self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5)) + @require_torch_gpu + def test_sharded_checkpoints_with_variant(self): + torch.manual_seed(0) + config, inputs_dict = self.prepare_init_args_and_inputs_for_common() + model = self.model_class(**config).eval() + model = model.to(torch_device) + + base_output = model(**inputs_dict) + + model_size = compute_module_sizes(model)[""] + max_shard_size = int((model_size * 0.75) / (2**10)) # Convert to KB as these test models are small. + variant = "fp16" + with tempfile.TemporaryDirectory() as tmp_dir: + # It doesn't matter if the actual model is in fp16 or not. Just adding the variant and + # testing if loading works with the variant when the checkpoint is sharded should be + # enough. + model.cpu().save_pretrained(tmp_dir, max_shard_size=f"{max_shard_size}KB", variant=variant) + index_filename = _add_variant(SAFE_WEIGHTS_INDEX_NAME, variant) + self.assertTrue(os.path.exists(os.path.join(tmp_dir, index_filename))) + + # Now check if the right number of shards exists. First, let's get the number of shards. + # Since this number can be dependent on the model being tested, it's important that we calculate it + # instead of hardcoding it. + expected_num_shards = caculate_expected_num_shards(os.path.join(tmp_dir, index_filename)) + actual_num_shards = len([file for file in os.listdir(tmp_dir) if file.endswith(".safetensors")]) + self.assertTrue(actual_num_shards == expected_num_shards) + + new_model = self.model_class.from_pretrained(tmp_dir, variant=variant).eval() + new_model = new_model.to(torch_device) + + torch.manual_seed(0) + if "generator" in inputs_dict: + _, inputs_dict = self.prepare_init_args_and_inputs_for_common() + new_output = new_model(**inputs_dict) + + self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5)) + @require_torch_gpu def test_sharded_checkpoints_device_map(self): config, inputs_dict = self.prepare_init_args_and_inputs_for_common() diff --git a/tests/models/transformers/test_models_transformer_temporal.py b/tests/models/transformers/test_models_transformer_temporal.py new file mode 100644 index 000000000000..7b689447cf29 --- /dev/null +++ b/tests/models/transformers/test_models_transformer_temporal.py @@ -0,0 +1,67 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
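For reference, the `test_sharded_checkpoints_with_variant` test added above boils down to the save/load round trip sketched below. This is a minimal sketch, assuming a small `UNet2DModel`, a hard-coded shard size and the `fp16` tag purely for illustration; the test itself derives `max_shard_size` from the measured model size so sharding is guaranteed for whatever dummy model is under test.

    import os
    import tempfile

    from diffusers import UNet2DModel
    from diffusers.utils import SAFE_WEIGHTS_INDEX_NAME
    from diffusers.utils.hub_utils import _add_variant

    # A deliberately tiny model so that a small max_shard_size forces sharding.
    model = UNet2DModel(
        block_out_channels=(8, 8),
        norm_num_groups=4,
        sample_size=16,
        in_channels=3,
        out_channels=3,
        down_block_types=("DownBlock2D", "DownBlock2D"),
        up_block_types=("UpBlock2D", "UpBlock2D"),
    )

    with tempfile.TemporaryDirectory() as tmp_dir:
        # Save sharded weights tagged with a variant; the shard index file name carries the variant.
        model.save_pretrained(tmp_dir, max_shard_size="10KB", variant="fp16")
        assert os.path.exists(os.path.join(tmp_dir, _add_variant(SAFE_WEIGHTS_INDEX_NAME, "fp16")))

        # Loading must be told the variant so it resolves the variant-named shards.
        reloaded = UNet2DModel.from_pretrained(tmp_dir, variant="fp16")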
+ +import unittest + +import torch + +from diffusers.models.transformers import TransformerTemporalModel +from diffusers.utils.testing_utils import ( + enable_full_determinism, + torch_device, +) + +from ..test_modeling_common import ModelTesterMixin + + +enable_full_determinism() + + +class TemporalTransformerTests(ModelTesterMixin, unittest.TestCase): + model_class = TransformerTemporalModel + main_input_name = "hidden_states" + + @property + def dummy_input(self): + batch_size = 2 + num_channels = 4 + height = width = 32 + + hidden_states = torch.randn((batch_size, num_channels, height, width)).to(torch_device) + timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device) + + return { + "hidden_states": hidden_states, + "timestep": timestep, + } + + @property + def input_shape(self): + return (4, 32, 32) + + @property + def output_shape(self): + return (4, 32, 32) + + def prepare_init_args_and_inputs_for_common(self): + init_dict = { + "num_attention_heads": 8, + "attention_head_dim": 4, + "in_channels": 4, + "num_layers": 1, + "norm_num_groups": 1, + } + inputs_dict = self.dummy_input + return init_dict, inputs_dict diff --git a/tests/pipelines/animatediff/test_animatediff.py b/tests/pipelines/animatediff/test_animatediff.py index 0db240a2855d..dd636b7ce669 100644 --- a/tests/pipelines/animatediff/test_animatediff.py +++ b/tests/pipelines/animatediff/test_animatediff.py @@ -10,7 +10,10 @@ AnimateDiffPipeline, AutoencoderKL, DDIMScheduler, + DPMSolverMultistepScheduler, + LCMScheduler, MotionAdapter, + StableDiffusionPipeline, UNet2DConditionModel, UNetMotionModel, ) @@ -51,16 +54,19 @@ class AnimateDiffPipelineFastTests( ) def get_dummy_components(self): + cross_attention_dim = 8 + block_out_channels = (8, 8) + torch.manual_seed(0) unet = UNet2DConditionModel( - block_out_channels=(32, 64), + block_out_channels=block_out_channels, layers_per_block=2, - sample_size=32, + sample_size=8, in_channels=4, out_channels=4, down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, + cross_attention_dim=cross_attention_dim, norm_num_groups=2, ) scheduler = DDIMScheduler( @@ -71,18 +77,19 @@ def get_dummy_components(self): ) torch.manual_seed(0) vae = AutoencoderKL( - block_out_channels=[32, 64], + block_out_channels=block_out_channels, in_channels=3, out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], latent_channels=4, + norm_num_groups=2, ) torch.manual_seed(0) text_encoder_config = CLIPTextConfig( bos_token_id=0, eos_token_id=2, - hidden_size=32, + hidden_size=cross_attention_dim, intermediate_size=37, layer_norm_eps=1e-05, num_attention_heads=4, @@ -92,8 +99,9 @@ def get_dummy_components(self): ) text_encoder = CLIPTextModel(text_encoder_config) tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + torch.manual_seed(0) motion_adapter = MotionAdapter( - block_out_channels=(32, 64), + block_out_channels=block_out_channels, motion_layers_per_block=2, motion_norm_num_groups=2, motion_num_attention_heads=4, @@ -126,6 +134,36 @@ def get_dummy_inputs(self, device, seed=0): } return inputs + def test_from_pipe_consistent_config(self): + assert self.original_pipeline_class == StableDiffusionPipeline + original_repo = "hf-internal-testing/tinier-stable-diffusion-pipe" + original_kwargs = {"requires_safety_checker": False} + + # create original_pipeline_class(sd) + pipe_original = 
self.original_pipeline_class.from_pretrained(original_repo, **original_kwargs) + + # original_pipeline_class(sd) -> pipeline_class + pipe_components = self.get_dummy_components() + pipe_additional_components = {} + for name, component in pipe_components.items(): + if name not in pipe_original.components: + pipe_additional_components[name] = component + + pipe = self.pipeline_class.from_pipe(pipe_original, **pipe_additional_components) + + # pipeline_class -> original_pipeline_class(sd) + original_pipe_additional_components = {} + for name, component in pipe_original.components.items(): + if name not in pipe.components or not isinstance(component, pipe.components[name].__class__): + original_pipe_additional_components[name] = component + + pipe_original_2 = self.original_pipeline_class.from_pipe(pipe, **original_pipe_additional_components) + + # compare the config + original_config = {k: v for k, v in pipe_original.config.items() if not k.startswith("_")} + original_config_2 = {k: v for k, v in pipe_original_2.config.items() if not k.startswith("_")} + assert original_config_2 == original_config + def test_motion_unet_loading(self): components = self.get_dummy_components() pipe = AnimateDiffPipeline(**components) @@ -141,33 +179,33 @@ def test_ip_adapter_single(self): if torch_device == "cpu": expected_pipe_slice = np.array( [ - 0.5541, - 0.5802, - 0.5074, - 0.4583, - 0.4729, - 0.5374, - 0.4051, - 0.4495, - 0.4480, - 0.5292, - 0.6322, - 0.6265, - 0.5455, - 0.4771, - 0.5795, - 0.5845, - 0.4172, - 0.6066, - 0.6535, - 0.4113, - 0.6833, - 0.5736, - 0.3589, - 0.5730, - 0.4205, - 0.3786, - 0.5323, + 0.5216, + 0.5620, + 0.4927, + 0.5082, + 0.4786, + 0.5932, + 0.5125, + 0.4514, + 0.5315, + 0.4694, + 0.3276, + 0.4863, + 0.3920, + 0.3684, + 0.5745, + 0.4499, + 0.5081, + 0.5414, + 0.6014, + 0.5062, + 0.3630, + 0.5296, + 0.6018, + 0.5098, + 0.4948, + 0.5101, + 0.5620, ] ) return super().test_ip_adapter_single(expected_pipe_slice=expected_pipe_slice) @@ -175,7 +213,7 @@ def test_ip_adapter_single(self): def test_dict_tuple_outputs_equivalent(self): expected_slice = None if torch_device == "cpu": - expected_slice = np.array([0.4051, 0.4495, 0.4480, 0.5845, 0.4172, 0.6066, 0.4205, 0.3786, 0.5323]) + expected_slice = np.array([0.5125, 0.4514, 0.5315, 0.4499, 0.5081, 0.5414, 0.4948, 0.5101, 0.5620]) return super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice) def test_inference_batch_single_identical( @@ -279,7 +317,7 @@ def test_prompt_embeds(self): inputs = self.get_dummy_inputs(torch_device) inputs.pop("prompt") - inputs["prompt_embeds"] = torch.randn((1, 4, 32), device=torch_device) + inputs["prompt_embeds"] = torch.randn((1, 4, pipe.text_encoder.config.hidden_size), device=torch_device) pipe(**inputs) def test_free_init(self): @@ -317,6 +355,52 @@ def test_free_init(self): "Disabling of FreeInit should lead to results similar to the default pipeline results", ) + def test_free_init_with_schedulers(self): + components = self.get_dummy_components() + pipe: AnimateDiffPipeline = self.pipeline_class(**components) + pipe.set_progress_bar_config(disable=None) + pipe.to(torch_device) + + inputs_normal = self.get_dummy_inputs(torch_device) + frames_normal = pipe(**inputs_normal).frames[0] + + schedulers_to_test = [ + DPMSolverMultistepScheduler.from_config( + components["scheduler"].config, + timestep_spacing="linspace", + beta_schedule="linear", + algorithm_type="dpmsolver++", + steps_offset=1, + clip_sample=False, + ), + LCMScheduler.from_config( + components["scheduler"].config, + 
timestep_spacing="linspace", + beta_schedule="linear", + steps_offset=1, + clip_sample=False, + ), + ] + components.pop("scheduler") + + for scheduler in schedulers_to_test: + components["scheduler"] = scheduler + pipe: AnimateDiffPipeline = self.pipeline_class(**components) + pipe.set_progress_bar_config(disable=None) + pipe.to(torch_device) + + pipe.enable_free_init(num_iters=2, use_fast_sampling=False) + + inputs = self.get_dummy_inputs(torch_device) + frames_enable_free_init = pipe(**inputs).frames[0] + sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_init)).sum() + + self.assertGreater( + sum_enabled, + 1e1, + "Enabling of FreeInit should lead to results different from the default pipeline results", + ) + @unittest.skipIf( torch_device != "cuda" or not is_xformers_available(), reason="XFormers attention is only available with CUDA and `xformers` installed", diff --git a/tests/pipelines/animatediff/test_animatediff_sparsectrl.py b/tests/pipelines/animatediff/test_animatediff_sparsectrl.py new file mode 100644 index 000000000000..5d8a7228118d --- /dev/null +++ b/tests/pipelines/animatediff/test_animatediff_sparsectrl.py @@ -0,0 +1,478 @@ +import unittest + +import numpy as np +import torch +from PIL import Image +from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer + +import diffusers +from diffusers import ( + AnimateDiffSparseControlNetPipeline, + AutoencoderKL, + DDIMScheduler, + DPMSolverMultistepScheduler, + LCMScheduler, + MotionAdapter, + SparseControlNetModel, + StableDiffusionPipeline, + UNet2DConditionModel, + UNetMotionModel, +) +from diffusers.utils import logging +from diffusers.utils.testing_utils import torch_device + +from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS +from ..test_pipelines_common import ( + IPAdapterTesterMixin, + PipelineFromPipeTesterMixin, + PipelineTesterMixin, + SDFunctionTesterMixin, +) + + +def to_np(tensor): + if isinstance(tensor, torch.Tensor): + tensor = tensor.detach().cpu().numpy() + + return tensor + + +class AnimateDiffSparseControlNetPipelineFastTests( + IPAdapterTesterMixin, SDFunctionTesterMixin, PipelineTesterMixin, PipelineFromPipeTesterMixin, unittest.TestCase +): + pipeline_class = AnimateDiffSparseControlNetPipeline + params = TEXT_TO_IMAGE_PARAMS + batch_params = TEXT_TO_IMAGE_BATCH_PARAMS + required_optional_params = frozenset( + [ + "num_inference_steps", + "generator", + "latents", + "return_dict", + "callback_on_step_end", + "callback_on_step_end_tensor_inputs", + ] + ) + + def get_dummy_components(self): + cross_attention_dim = 8 + block_out_channels = (8, 8) + + torch.manual_seed(0) + unet = UNet2DConditionModel( + block_out_channels=block_out_channels, + layers_per_block=2, + sample_size=8, + in_channels=4, + out_channels=4, + down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"), + up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), + cross_attention_dim=cross_attention_dim, + norm_num_groups=2, + ) + scheduler = DDIMScheduler( + beta_start=0.00085, + beta_end=0.012, + beta_schedule="linear", + clip_sample=False, + ) + torch.manual_seed(0) + controlnet = SparseControlNetModel( + block_out_channels=block_out_channels, + layers_per_block=2, + in_channels=4, + conditioning_channels=3, + down_block_types=("CrossAttnDownBlockMotion", "DownBlockMotion"), + cross_attention_dim=cross_attention_dim, + conditioning_embedding_out_channels=(8, 8), + norm_num_groups=1, + use_simplified_condition_embedding=False, + ) + torch.manual_seed(0) + vae = AutoencoderKL( + 
block_out_channels=block_out_channels, + in_channels=3, + out_channels=3, + down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], + up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], + latent_channels=4, + norm_num_groups=2, + ) + torch.manual_seed(0) + text_encoder_config = CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=cross_attention_dim, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + ) + text_encoder = CLIPTextModel(text_encoder_config) + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + motion_adapter = MotionAdapter( + block_out_channels=block_out_channels, + motion_layers_per_block=2, + motion_norm_num_groups=2, + motion_num_attention_heads=4, + ) + + components = { + "unet": unet, + "controlnet": controlnet, + "scheduler": scheduler, + "vae": vae, + "motion_adapter": motion_adapter, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "feature_extractor": None, + "image_encoder": None, + } + return components + + def get_dummy_inputs(self, device, seed: int = 0, num_frames: int = 2): + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + + video_height = 32 + video_width = 32 + conditioning_frames = [Image.new("RGB", (video_width, video_height))] * num_frames + + inputs = { + "prompt": "A painting of a squirrel eating a burger", + "conditioning_frames": conditioning_frames, + "controlnet_frame_indices": list(range(num_frames)), + "generator": generator, + "num_inference_steps": 2, + "num_frames": num_frames, + "guidance_scale": 7.5, + "output_type": "pt", + } + return inputs + + def test_from_pipe_consistent_config(self): + assert self.original_pipeline_class == StableDiffusionPipeline + original_repo = "hf-internal-testing/tinier-stable-diffusion-pipe" + original_kwargs = {"requires_safety_checker": False} + + # create original_pipeline_class(sd) + pipe_original = self.original_pipeline_class.from_pretrained(original_repo, **original_kwargs) + + # original_pipeline_class(sd) -> pipeline_class + pipe_components = self.get_dummy_components() + pipe_additional_components = {} + for name, component in pipe_components.items(): + if name not in pipe_original.components: + pipe_additional_components[name] = component + + pipe = self.pipeline_class.from_pipe(pipe_original, **pipe_additional_components) + + # pipeline_class -> original_pipeline_class(sd) + original_pipe_additional_components = {} + for name, component in pipe_original.components.items(): + if name not in pipe.components or not isinstance(component, pipe.components[name].__class__): + original_pipe_additional_components[name] = component + + pipe_original_2 = self.original_pipeline_class.from_pipe(pipe, **original_pipe_additional_components) + + # compare the config + original_config = {k: v for k, v in pipe_original.config.items() if not k.startswith("_")} + original_config_2 = {k: v for k, v in pipe_original_2.config.items() if not k.startswith("_")} + assert original_config_2 == original_config + + def test_motion_unet_loading(self): + components = self.get_dummy_components() + pipe = AnimateDiffSparseControlNetPipeline(**components) + + assert isinstance(pipe.unet, UNetMotionModel) + + @unittest.skip("Attention slicing is not enabled in this pipeline") + def test_attention_slicing_forward_pass(self): + pass + + def test_ip_adapter_single(self): + expected_pipe_slice = 
None + if torch_device == "cpu": + expected_pipe_slice = np.array( + [ + 0.6604, + 0.4099, + 0.4928, + 0.5706, + 0.5096, + 0.5012, + 0.6051, + 0.5169, + 0.5021, + 0.4864, + 0.4261, + 0.5779, + 0.5822, + 0.4049, + 0.5253, + 0.6160, + 0.4150, + 0.5155, + ] + ) + return super().test_ip_adapter_single(expected_pipe_slice=expected_pipe_slice) + + def test_dict_tuple_outputs_equivalent(self): + expected_slice = None + if torch_device == "cpu": + expected_slice = np.array([0.6051, 0.5169, 0.5021, 0.6160, 0.4150, 0.5155]) + return super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice) + + def test_inference_batch_single_identical( + self, + batch_size=2, + expected_max_diff=1e-4, + additional_params_copy_to_batched_inputs=["num_inference_steps"], + ): + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + for components in pipe.components.values(): + if hasattr(components, "set_default_attn_processor"): + components.set_default_attn_processor() + + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + inputs = self.get_dummy_inputs(torch_device) + # Reset generator in case it has been used in self.get_dummy_inputs + inputs["generator"] = self.get_generator(0) + + logger = logging.get_logger(pipe.__module__) + logger.setLevel(level=diffusers.logging.FATAL) + + # batchify inputs + batched_inputs = {} + batched_inputs.update(inputs) + + for name in self.batch_params: + if name not in inputs: + continue + + value = inputs[name] + if name == "prompt": + len_prompt = len(value) + batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)] + batched_inputs[name][-1] = 100 * "very long" + + else: + batched_inputs[name] = batch_size * [value] + + if "generator" in inputs: + batched_inputs["generator"] = [self.get_generator(i) for i in range(batch_size)] + + if "batch_size" in inputs: + batched_inputs["batch_size"] = batch_size + + for arg in additional_params_copy_to_batched_inputs: + batched_inputs[arg] = inputs[arg] + + output = pipe(**inputs) + output_batch = pipe(**batched_inputs) + + assert output_batch[0].shape[0] == batch_size + + max_diff = np.abs(to_np(output_batch[0][0]) - to_np(output[0][0])).max() + assert max_diff < expected_max_diff + + def test_inference_batch_single_identical_use_simplified_condition_embedding_true( + self, + batch_size=2, + expected_max_diff=1e-4, + additional_params_copy_to_batched_inputs=["num_inference_steps"], + ): + components = self.get_dummy_components() + + torch.manual_seed(0) + old_controlnet = components.pop("controlnet") + components["controlnet"] = SparseControlNetModel.from_config( + old_controlnet.config, conditioning_channels=4, use_simplified_condition_embedding=True + ) + + pipe = self.pipeline_class(**components) + for components in pipe.components.values(): + if hasattr(components, "set_default_attn_processor"): + components.set_default_attn_processor() + + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + inputs = self.get_dummy_inputs(torch_device) + # Reset generator in case it has been used in self.get_dummy_inputs + inputs["generator"] = self.get_generator(0) + + logger = logging.get_logger(pipe.__module__) + logger.setLevel(level=diffusers.logging.FATAL) + + # batchify inputs + batched_inputs = {} + batched_inputs.update(inputs) + + for name in self.batch_params: + if name not in inputs: + continue + + value = inputs[name] + if name == "prompt": + len_prompt = len(value) + batched_inputs[name] = [value[: len_prompt // i] for i in
range(1, batch_size + 1)] + batched_inputs[name][-1] = 100 * "very long" + + else: + batched_inputs[name] = batch_size * [value] + + if "generator" in inputs: + batched_inputs["generator"] = [self.get_generator(i) for i in range(batch_size)] + + if "batch_size" in inputs: + batched_inputs["batch_size"] = batch_size + + for arg in additional_params_copy_to_batched_inputs: + batched_inputs[arg] = inputs[arg] + + output = pipe(**inputs) + output_batch = pipe(**batched_inputs) + + assert output_batch[0].shape[0] == batch_size + + max_diff = np.abs(to_np(output_batch[0][0]) - to_np(output[0][0])).max() + assert max_diff < expected_max_diff + + @unittest.skipIf(torch_device != "cuda", reason="CUDA and CPU are required to switch devices") + def test_to_device(self): + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe.set_progress_bar_config(disable=None) + + pipe.to("cpu") + # pipeline creates a new motion UNet under the hood. So we need to check the device from pipe.components + model_devices = [ + component.device.type for component in pipe.components.values() if hasattr(component, "device") + ] + self.assertTrue(all(device == "cpu" for device in model_devices)) + + output_cpu = pipe(**self.get_dummy_inputs("cpu"))[0] + self.assertTrue(np.isnan(output_cpu).sum() == 0) + + pipe.to("cuda") + model_devices = [ + component.device.type for component in pipe.components.values() if hasattr(component, "device") + ] + self.assertTrue(all(device == "cuda" for device in model_devices)) + + output_cuda = pipe(**self.get_dummy_inputs("cuda"))[0] + self.assertTrue(np.isnan(to_np(output_cuda)).sum() == 0) + + def test_to_dtype(self): + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe.set_progress_bar_config(disable=None) + + # pipeline creates a new motion UNet under the hood. 
So we need to check the dtype from pipe.components + model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")] + self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes)) + + pipe.to(dtype=torch.float16) + model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")] + self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes)) + + def test_prompt_embeds(self): + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe.set_progress_bar_config(disable=None) + pipe.to(torch_device) + + inputs = self.get_dummy_inputs(torch_device) + inputs.pop("prompt") + inputs["prompt_embeds"] = torch.randn((1, 4, pipe.text_encoder.config.hidden_size), device=torch_device) + pipe(**inputs) + + def test_free_init(self): + components = self.get_dummy_components() + pipe: AnimateDiffSparseControlNetPipeline = self.pipeline_class(**components) + pipe.set_progress_bar_config(disable=None) + pipe.to(torch_device) + + inputs_normal = self.get_dummy_inputs(torch_device) + frames_normal = pipe(**inputs_normal).frames[0] + + pipe.enable_free_init( + num_iters=2, + use_fast_sampling=True, + method="butterworth", + order=4, + spatial_stop_frequency=0.25, + temporal_stop_frequency=0.25, + ) + inputs_enable_free_init = self.get_dummy_inputs(torch_device) + frames_enable_free_init = pipe(**inputs_enable_free_init).frames[0] + + pipe.disable_free_init() + inputs_disable_free_init = self.get_dummy_inputs(torch_device) + frames_disable_free_init = pipe(**inputs_disable_free_init).frames[0] + + sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_init)).sum() + max_diff_disabled = np.abs(to_np(frames_normal) - to_np(frames_disable_free_init)).max() + self.assertGreater( + sum_enabled, 1e1, "Enabling of FreeInit should lead to results different from the default pipeline results" + ) + self.assertLess( + max_diff_disabled, + 1e-4, + "Disabling of FreeInit should lead to results similar to the default pipeline results", + ) + + def test_free_init_with_schedulers(self): + components = self.get_dummy_components() + pipe: AnimateDiffSparseControlNetPipeline = self.pipeline_class(**components) + pipe.set_progress_bar_config(disable=None) + pipe.to(torch_device) + + inputs_normal = self.get_dummy_inputs(torch_device) + frames_normal = pipe(**inputs_normal).frames[0] + + schedulers_to_test = [ + DPMSolverMultistepScheduler.from_config( + components["scheduler"].config, + timestep_spacing="linspace", + beta_schedule="linear", + algorithm_type="dpmsolver++", + steps_offset=1, + clip_sample=False, + ), + LCMScheduler.from_config( + components["scheduler"].config, + timestep_spacing="linspace", + beta_schedule="linear", + steps_offset=1, + clip_sample=False, + ), + ] + components.pop("scheduler") + + for scheduler in schedulers_to_test: + components["scheduler"] = scheduler + pipe: AnimateDiffSparseControlNetPipeline = self.pipeline_class(**components) + pipe.set_progress_bar_config(disable=None) + pipe.to(torch_device) + + pipe.enable_free_init(num_iters=2, use_fast_sampling=False) + + inputs = self.get_dummy_inputs(torch_device) + frames_enable_free_init = pipe(**inputs).frames[0] + sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_init)).sum() + + self.assertGreater( + sum_enabled, + 1e1, + "Enabling of FreeInit should lead to results different from the default pipeline results", + ) + + def test_vae_slicing(self): + return 
super().test_vae_slicing(image_count=2) diff --git a/tests/pipelines/animatediff/test_animatediff_video2video.py b/tests/pipelines/animatediff/test_animatediff_video2video.py index bc847400adc3..ced042b4a702 100644 --- a/tests/pipelines/animatediff/test_animatediff_video2video.py +++ b/tests/pipelines/animatediff/test_animatediff_video2video.py @@ -10,7 +10,10 @@ AnimateDiffVideoToVideoPipeline, AutoencoderKL, DDIMScheduler, + DPMSolverMultistepScheduler, + LCMScheduler, MotionAdapter, + StableDiffusionPipeline, UNet2DConditionModel, UNetMotionModel, ) @@ -46,16 +49,19 @@ class AnimateDiffVideoToVideoPipelineFastTests( ) def get_dummy_components(self): + cross_attention_dim = 8 + block_out_channels = (8, 8) + torch.manual_seed(0) unet = UNet2DConditionModel( - block_out_channels=(32, 64), + block_out_channels=block_out_channels, layers_per_block=2, - sample_size=32, + sample_size=8, in_channels=4, out_channels=4, down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, + cross_attention_dim=cross_attention_dim, norm_num_groups=2, ) scheduler = DDIMScheduler( @@ -66,18 +72,19 @@ def get_dummy_components(self): ) torch.manual_seed(0) vae = AutoencoderKL( - block_out_channels=[32, 64], + block_out_channels=block_out_channels, in_channels=3, out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], latent_channels=4, + norm_num_groups=2, ) torch.manual_seed(0) text_encoder_config = CLIPTextConfig( bos_token_id=0, eos_token_id=2, - hidden_size=32, + hidden_size=cross_attention_dim, intermediate_size=37, layer_norm_eps=1e-05, num_attention_heads=4, @@ -87,8 +94,9 @@ def get_dummy_components(self): ) text_encoder = CLIPTextModel(text_encoder_config) tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + torch.manual_seed(0) motion_adapter = MotionAdapter( - block_out_channels=(32, 64), + block_out_channels=block_out_channels, motion_layers_per_block=2, motion_norm_num_groups=2, motion_num_attention_heads=4, @@ -127,6 +135,36 @@ def get_dummy_inputs(self, device, seed=0): } return inputs + def test_from_pipe_consistent_config(self): + assert self.original_pipeline_class == StableDiffusionPipeline + original_repo = "hf-internal-testing/tinier-stable-diffusion-pipe" + original_kwargs = {"requires_safety_checker": False} + + # create original_pipeline_class(sd) + pipe_original = self.original_pipeline_class.from_pretrained(original_repo, **original_kwargs) + + # original_pipeline_class(sd) -> pipeline_class + pipe_components = self.get_dummy_components() + pipe_additional_components = {} + for name, component in pipe_components.items(): + if name not in pipe_original.components: + pipe_additional_components[name] = component + + pipe = self.pipeline_class.from_pipe(pipe_original, **pipe_additional_components) + + # pipeline_class -> original_pipeline_class(sd) + original_pipe_additional_components = {} + for name, component in pipe_original.components.items(): + if name not in pipe.components or not isinstance(component, pipe.components[name].__class__): + original_pipe_additional_components[name] = component + + pipe_original_2 = self.original_pipeline_class.from_pipe(pipe, **original_pipe_additional_components) + + # compare the config + original_config = {k: v for k, v in pipe_original.config.items() if not k.startswith("_")} + original_config_2 = {k: v for k, v in pipe_original_2.config.items() if not 
k.startswith("_")} + assert original_config_2 == original_config + def test_motion_unet_loading(self): components = self.get_dummy_components() pipe = AnimateDiffVideoToVideoPipeline(**components) @@ -143,24 +181,24 @@ def test_ip_adapter_single(self): if torch_device == "cpu": expected_pipe_slice = np.array( [ - 0.4947, - 0.4780, - 0.4340, - 0.4666, - 0.4028, - 0.4645, - 0.4915, - 0.4101, - 0.4308, - 0.4581, - 0.3582, - 0.4953, - 0.4466, - 0.5348, - 0.5863, - 0.5299, + 0.5569, + 0.6250, + 0.4145, + 0.5613, + 0.5563, 0.5213, - 0.5017, + 0.5092, + 0.4950, + 0.4950, + 0.5685, + 0.3858, + 0.4864, + 0.6458, + 0.4312, + 0.5518, + 0.5608, + 0.4418, + 0.5378, ] ) return super().test_ip_adapter_single(expected_pipe_slice=expected_pipe_slice) @@ -266,7 +304,7 @@ def test_prompt_embeds(self): inputs = self.get_dummy_inputs(torch_device) inputs.pop("prompt") - inputs["prompt_embeds"] = torch.randn((1, 4, 32), device=torch_device) + inputs["prompt_embeds"] = torch.randn((1, 4, pipe.text_encoder.config.hidden_size), device=torch_device) pipe(**inputs) def test_latent_inputs(self): @@ -276,7 +314,8 @@ def test_latent_inputs(self): pipe.to(torch_device) inputs = self.get_dummy_inputs(torch_device) - inputs["latents"] = torch.randn((1, 4, 1, 32, 32), device=torch_device) + sample_size = pipe.unet.config.sample_size + inputs["latents"] = torch.randn((1, 4, 1, sample_size, sample_size), device=torch_device) inputs.pop("video") pipe(**inputs) @@ -343,3 +382,49 @@ def test_free_init(self): 1e-4, "Disabling of FreeInit should lead to results similar to the default pipeline results", ) + + def test_free_init_with_schedulers(self): + components = self.get_dummy_components() + pipe: AnimateDiffVideoToVideoPipeline = self.pipeline_class(**components) + pipe.set_progress_bar_config(disable=None) + pipe.to(torch_device) + + inputs_normal = self.get_dummy_inputs(torch_device) + frames_normal = pipe(**inputs_normal).frames[0] + + schedulers_to_test = [ + DPMSolverMultistepScheduler.from_config( + components["scheduler"].config, + timestep_spacing="linspace", + beta_schedule="linear", + algorithm_type="dpmsolver++", + steps_offset=1, + clip_sample=False, + ), + LCMScheduler.from_config( + components["scheduler"].config, + timestep_spacing="linspace", + beta_schedule="linear", + steps_offset=1, + clip_sample=False, + ), + ] + components.pop("scheduler") + + for scheduler in schedulers_to_test: + components["scheduler"] = scheduler + pipe: AnimateDiffVideoToVideoPipeline = self.pipeline_class(**components) + pipe.set_progress_bar_config(disable=None) + pipe.to(torch_device) + + pipe.enable_free_init(num_iters=2, use_fast_sampling=False) + + inputs = self.get_dummy_inputs(torch_device) + frames_enable_free_init = pipe(**inputs).frames[0] + sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_init)).sum() + + self.assertGreater( + sum_enabled, + 1e1, + "Enabling of FreeInit should lead to results different from the default pipeline results", + ) diff --git a/tests/pipelines/audioldm2/test_audioldm2.py b/tests/pipelines/audioldm2/test_audioldm2.py index 08fd361940a9..fb550dd3219d 100644 --- a/tests/pipelines/audioldm2/test_audioldm2.py +++ b/tests/pipelines/audioldm2/test_audioldm2.py @@ -73,14 +73,15 @@ class AudioLDM2PipelineFastTests(PipelineTesterMixin, unittest.TestCase): def get_dummy_components(self): torch.manual_seed(0) unet = AudioLDM2UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, + block_out_channels=(8, 16), + layers_per_block=1, + norm_num_groups=8, sample_size=32, 
in_channels=4, out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=([None, 16, 32], [None, 16, 32]), + cross_attention_dim=(8, 16), ) scheduler = DDIMScheduler( beta_start=0.00085, @@ -91,9 +92,10 @@ def get_dummy_components(self): ) torch.manual_seed(0) vae = AutoencoderKL( - block_out_channels=[32, 64], + block_out_channels=[8, 16], in_channels=1, out_channels=1, + norm_num_groups=8, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], latent_channels=4, @@ -102,32 +104,34 @@ def get_dummy_components(self): text_branch_config = ClapTextConfig( bos_token_id=0, eos_token_id=2, - hidden_size=16, + hidden_size=8, intermediate_size=37, layer_norm_eps=1e-05, - num_attention_heads=2, - num_hidden_layers=2, + num_attention_heads=1, + num_hidden_layers=1, pad_token_id=1, vocab_size=1000, - projection_dim=16, + projection_dim=8, ) audio_branch_config = ClapAudioConfig( - spec_size=64, + spec_size=8, window_size=4, - num_mel_bins=64, + num_mel_bins=8, intermediate_size=37, layer_norm_eps=1e-05, - depths=[2, 2], - num_attention_heads=[2, 2], - num_hidden_layers=2, + depths=[1, 1], + num_attention_heads=[1, 1], + num_hidden_layers=1, hidden_size=192, - projection_dim=16, + projection_dim=8, patch_size=2, patch_stride=2, patch_embed_input_channels=4, ) text_encoder_config = ClapConfig.from_text_audio_configs( - text_config=text_branch_config, audio_config=audio_branch_config, projection_dim=16 + text_config=text_branch_config, + audio_config=audio_branch_config, + projection_dim=16, ) text_encoder = ClapModel(text_encoder_config) tokenizer = RobertaTokenizer.from_pretrained("hf-internal-testing/tiny-random-roberta", model_max_length=77) @@ -141,8 +145,8 @@ def get_dummy_components(self): d_model=32, d_ff=37, d_kv=8, - num_heads=2, - num_layers=2, + num_heads=1, + num_layers=1, ) text_encoder_2 = T5EncoderModel(text_encoder_2_config) tokenizer_2 = T5Tokenizer.from_pretrained("hf-internal-testing/tiny-random-T5Model", model_max_length=77) @@ -150,8 +154,8 @@ def get_dummy_components(self): torch.manual_seed(0) language_model_config = GPT2Config( n_embd=16, - n_head=2, - n_layer=2, + n_head=1, + n_layer=1, vocab_size=1000, n_ctx=99, n_positions=99, @@ -160,7 +164,11 @@ def get_dummy_components(self): language_model.config.max_new_tokens = 8 torch.manual_seed(0) - projection_model = AudioLDM2ProjectionModel(text_encoder_dim=16, text_encoder_1_dim=32, langauge_model_dim=16) + projection_model = AudioLDM2ProjectionModel( + text_encoder_dim=16, + text_encoder_1_dim=32, + langauge_model_dim=16, + ) vocoder_config = SpeechT5HifiGanConfig( model_in_dim=8, @@ -220,7 +228,18 @@ def test_audioldm2_ddim(self): audio_slice = audio[:10] expected_slice = np.array( - [0.0025, 0.0018, 0.0018, -0.0023, -0.0026, -0.0020, -0.0026, -0.0021, -0.0027, -0.0020] + [ + 2.602e-03, + 1.729e-03, + 1.863e-03, + -2.219e-03, + -2.656e-03, + -2.017e-03, + -2.648e-03, + -2.115e-03, + -2.502e-03, + -2.081e-03, + ] ) assert np.abs(audio_slice - expected_slice).max() < 1e-4 @@ -361,7 +380,7 @@ def test_audioldm2_negative_prompt(self): audio_slice = audio[:10] expected_slice = np.array( - [0.0025, 0.0018, 0.0018, -0.0023, -0.0026, -0.0020, -0.0026, -0.0021, -0.0027, -0.0020] + [0.0026, 0.0017, 0.0018, -0.0022, -0.0026, -0.002, -0.0026, -0.0021, -0.0025, -0.0021] ) assert np.abs(audio_slice - expected_slice).max() < 1e-4 @@ -388,7 +407,7 @@ def 
test_audioldm2_num_waveforms_per_prompt(self): assert audios.shape == (batch_size, 256) # test num_waveforms_per_prompt for single prompt - num_waveforms_per_prompt = 2 + num_waveforms_per_prompt = 1 audios = audioldm_pipe(prompt, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt).audios assert audios.shape == (num_waveforms_per_prompt, 256) diff --git a/tests/pipelines/aura_flow/test_pipeline_aura_dlow.py b/tests/pipelines/aura_flow/test_pipeline_aura_flow.py similarity index 100% rename from tests/pipelines/aura_flow/test_pipeline_aura_dlow.py rename to tests/pipelines/aura_flow/test_pipeline_aura_flow.py diff --git a/tests/pipelines/controlnet/test_controlnet_inpaint_sdxl.py b/tests/pipelines/controlnet/test_controlnet_inpaint_sdxl.py index 3341f6704e75..d2c63137c99e 100644 --- a/tests/pipelines/controlnet/test_controlnet_inpaint_sdxl.py +++ b/tests/pipelines/controlnet/test_controlnet_inpaint_sdxl.py @@ -37,7 +37,12 @@ UNet2DConditionModel, ) from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor, require_torch_gpu, torch_device +from diffusers.utils.testing_utils import ( + enable_full_determinism, + floats_tensor, + require_torch_gpu, + torch_device, +) from ..pipeline_params import ( IMAGE_TO_IMAGE_IMAGE_PARAMS, @@ -228,12 +233,6 @@ def get_dummy_inputs(self, device, seed=0, img_res=64): def test_attention_slicing_forward_pass(self): return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3) - def test_dict_tuple_outputs_equivalent(self): - expected_slice = None - if torch_device == "cpu": - expected_slice = np.array([0.5490, 0.5053, 0.4676, 0.5816, 0.5364, 0.4830, 0.5937, 0.5719, 0.4318]) - super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice) - @unittest.skipIf( torch_device != "cuda" or not is_xformers_available(), reason="XFormers attention is only available with CUDA and `xformers` installed", @@ -341,7 +340,8 @@ def test_controlnet_sdxl_guess(self): output = sd_pipe(**inputs) image_slice = output.images[0, -3:, -3:, -1] - expected_slice = np.array([0.549, 0.5053, 0.4676, 0.5816, 0.5364, 0.483, 0.5937, 0.5719, 0.4318]) + + expected_slice = np.array([0.5460, 0.4943, 0.4635, 0.5832, 0.5366, 0.4815, 0.6034, 0.5741, 0.4341]) # make sure that it's equal assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-4 diff --git a/tests/pipelines/controlnet/test_controlnet_sdxl.py b/tests/pipelines/controlnet/test_controlnet_sdxl.py index 4194b6510d17..6ee83cd6c92a 100644 --- a/tests/pipelines/controlnet/test_controlnet_sdxl.py +++ b/tests/pipelines/controlnet/test_controlnet_sdxl.py @@ -195,7 +195,7 @@ def test_ip_adapter_single(self, from_ssd1b=False, expected_pipe_slice=None): expected_pipe_slice = None if torch_device == "cpu": expected_pipe_slice = np.array( - [0.7331, 0.5907, 0.5667, 0.6029, 0.5679, 0.5968, 0.4033, 0.4761, 0.5090] + [0.7335, 0.5866, 0.5623, 0.6242, 0.5751, 0.5999, 0.4091, 0.4590, 0.5054] ) return super().test_ip_adapter_single(expected_pipe_slice=expected_pipe_slice) @@ -348,9 +348,8 @@ def test_controlnet_sdxl_guess(self): output = sd_pipe(**inputs) image_slice = output.images[0, -3:, -3:, -1] - expected_slice = np.array( - [0.7330834, 0.590667, 0.5667336, 0.6029023, 0.5679491, 0.5968194, 0.4032986, 0.47612396, 0.5089609] - ) + + expected_slice = np.array([0.7335, 0.5866, 0.5623, 0.6242, 0.5751, 0.5999, 0.4091, 0.4590, 0.5054]) # make sure that it's equal assert np.abs(image_slice.flatten() - expected_slice).max() 
< 1e-4 @@ -371,7 +370,7 @@ def test_controlnet_sdxl_lcm(self): image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.7799, 0.614, 0.6162, 0.7082, 0.6662, 0.5833, 0.4148, 0.5182, 0.4866]) + expected_slice = np.array([0.7820, 0.6195, 0.6193, 0.7045, 0.6706, 0.5837, 0.4147, 0.5232, 0.4868]) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 @@ -965,9 +964,8 @@ def test_controlnet_sdxl_guess(self): output = sd_pipe(**inputs) image_slice = output.images[0, -3:, -3:, -1] - expected_slice = np.array( - [0.6831671, 0.5702532, 0.5459845, 0.6299793, 0.58563006, 0.6033695, 0.4493941, 0.46132287, 0.5035841] - ) + + expected_slice = np.array([0.7212, 0.5890, 0.5491, 0.6425, 0.5970, 0.6091, 0.4418, 0.4556, 0.5032]) # make sure that it's equal assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-4 @@ -975,7 +973,8 @@ def test_controlnet_sdxl_guess(self): def test_ip_adapter_single(self): expected_pipe_slice = None if torch_device == "cpu": - expected_pipe_slice = np.array([0.6832, 0.5703, 0.5460, 0.6300, 0.5856, 0.6034, 0.4494, 0.4613, 0.5036]) + expected_pipe_slice = np.array([0.7212, 0.5890, 0.5491, 0.6425, 0.5970, 0.6091, 0.4418, 0.4556, 0.5032]) + return super().test_ip_adapter_single(from_ssd1b=True, expected_pipe_slice=expected_pipe_slice) def test_controlnet_sdxl_lcm(self): @@ -994,7 +993,7 @@ def test_controlnet_sdxl_lcm(self): image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.6850, 0.5135, 0.5545, 0.7033, 0.6617, 0.5971, 0.4165, 0.5480, 0.5070]) + expected_slice = np.array([0.6787, 0.5117, 0.5558, 0.6963, 0.6571, 0.5928, 0.4121, 0.5468, 0.5057]) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/tests/pipelines/controlnet/test_controlnet_sdxl_img2img.py b/tests/pipelines/controlnet/test_controlnet_sdxl_img2img.py index 8092e28c33e9..99ea395ad325 100644 --- a/tests/pipelines/controlnet/test_controlnet_sdxl_img2img.py +++ b/tests/pipelines/controlnet/test_controlnet_sdxl_img2img.py @@ -178,7 +178,8 @@ def get_dummy_inputs(self, device, seed=0): def test_ip_adapter_single(self): expected_pipe_slice = None if torch_device == "cpu": - expected_pipe_slice = np.array([0.6265, 0.5441, 0.5384, 0.5446, 0.5810, 0.5908, 0.5414, 0.5428, 0.5353]) + expected_pipe_slice = np.array([0.6276, 0.5271, 0.5205, 0.5393, 0.5774, 0.5872, 0.5456, 0.5415, 0.5354]) + # TODO: update after slices.p return super().test_ip_adapter_single(expected_pipe_slice=expected_pipe_slice) def test_stable_diffusion_xl_controlnet_img2img(self): diff --git a/tests/pipelines/controlnet_sd3/test_controlnet_sd3.py b/tests/pipelines/controlnet_sd3/test_controlnet_sd3.py index af5b38fafaf5..74cb56e0337a 100644 --- a/tests/pipelines/controlnet_sd3/test_controlnet_sd3.py +++ b/tests/pipelines/controlnet_sd3/test_controlnet_sd3.py @@ -180,11 +180,10 @@ def test_controlnet_sd3(self): image = output.images image_slice = image[0, -3:, -3:, -1] + assert image.shape == (1, 32, 32, 3) - expected_slice = np.array( - [0.5761719, 0.71777344, 0.59228516, 0.578125, 0.6020508, 0.39453125, 0.46728516, 0.51708984, 0.58984375] - ) + expected_slice = np.array([0.5767, 0.7100, 0.5981, 0.5674, 0.5952, 0.4102, 0.5093, 0.5044, 0.6030]) assert ( np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/tests/pipelines/hunyuan_dit/test_hunyuan_dit.py b/tests/pipelines/hunyuan_dit/test_hunyuan_dit.py index ad5f5f3ef2ca..653cb41e4bc4 100644 --- a/tests/pipelines/hunyuan_dit/test_hunyuan_dit.py +++ 
b/tests/pipelines/hunyuan_dit/test_hunyuan_dit.py @@ -36,7 +36,12 @@ ) from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS -from ..test_pipelines_common import PipelineTesterMixin, to_np +from ..test_pipelines_common import ( + PipelineTesterMixin, + check_qkv_fusion_matches_attn_procs_length, + check_qkv_fusion_processors_exist, + to_np, +) enable_full_determinism() @@ -261,6 +266,16 @@ def test_fused_qkv_projections(self): original_image_slice = image[0, -3:, -3:, -1] pipe.transformer.fuse_qkv_projections() + # TODO (sayakpaul): will refactor this once `fuse_qkv_projections()` has been added + # to the pipeline level. + pipe.transformer.fuse_qkv_projections() + assert check_qkv_fusion_processors_exist( + pipe.transformer + ), "Something wrong with the fused attention processors. Expected all the attention processors to be fused." + assert check_qkv_fusion_matches_attn_procs_length( + pipe.transformer, pipe.transformer.original_attn_processors + ), "Something wrong with the attention processors concerning the fused QKV projections." + inputs = self.get_dummy_inputs(device) inputs["return_dict"] = False image_fused = pipe(**inputs)[0] diff --git a/tests/pipelines/i2vgen_xl/test_i2vgenxl.py b/tests/pipelines/i2vgen_xl/test_i2vgenxl.py index 426e25812295..592ebd35f4a9 100644 --- a/tests/pipelines/i2vgen_xl/test_i2vgenxl.py +++ b/tests/pipelines/i2vgen_xl/test_i2vgenxl.py @@ -39,7 +39,6 @@ enable_full_determinism, floats_tensor, numpy_cosine_similarity_distance, - print_tensor_test, require_torch_gpu, skip_mps, slow, @@ -265,6 +264,5 @@ def test_i2vgen_xl(self): assert image.shape == (num_frames, 704, 1280, 3) image_slice = image[0, -3:, -3:, -1] - print_tensor_test(image_slice.flatten()) expected_slice = np.array([0.5482, 0.6244, 0.6274, 0.4584, 0.5935, 0.5937, 0.4579, 0.5767, 0.5892]) assert numpy_cosine_similarity_distance(image_slice.flatten(), expected_slice.flatten()) < 1e-3 diff --git a/tests/pipelines/kandinsky/test_kandinsky_combined.py b/tests/pipelines/kandinsky/test_kandinsky_combined.py index 79c4336ecbdd..607a47e08e58 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_combined.py +++ b/tests/pipelines/kandinsky/test_kandinsky_combined.py @@ -94,7 +94,7 @@ def test_kandinsky(self): assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.0000, 0.0000, 0.6777, 0.1363, 0.3624, 0.7868, 0.3869, 0.3395, 0.5068]) + expected_slice = np.array([0.2893, 0.1464, 0.4603, 0.3529, 0.4612, 0.7701, 0.4027, 0.3051, 0.5155]) assert ( np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 @@ -200,7 +200,7 @@ def test_kandinsky(self): assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.4260, 0.3596, 0.4571, 0.3890, 0.4087, 0.5137, 0.4819, 0.4116, 0.5053]) + expected_slice = np.array([0.4852, 0.4136, 0.4539, 0.4781, 0.4680, 0.5217, 0.4973, 0.4089, 0.4977]) assert ( np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 @@ -305,11 +305,14 @@ def test_kandinsky(self): )[0] image_slice = image[0, -3:, -3:, -1] + image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] + print(image_from_tuple_slice) + assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.0477, 0.0808, 0.2972, 0.2705, 0.3620, 0.6247, 0.4464, 0.2870, 0.3530]) + expected_slice = np.array([0.0320, 0.0860, 0.4013, 0.0518, 0.2484, 0.5847, 0.4411, 0.2321, 0.4593]) assert ( np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/tests/pipelines/kandinsky/test_kandinsky_prior.py 
b/tests/pipelines/kandinsky/test_kandinsky_prior.py index 8e5456ba0e8d..5f42447bd9d5 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_prior.py +++ b/tests/pipelines/kandinsky/test_kandinsky_prior.py @@ -211,12 +211,13 @@ def test_kandinsky_prior(self): )[0] image_slice = image[0, -10:] + image_from_tuple_slice = image_from_tuple[0, -10:] assert image.shape == (1, 32) expected_slice = np.array( - [-0.0532, 1.7120, 0.3656, -1.0852, -0.8946, -1.1756, 0.4348, 0.2482, 0.5146, -0.1156] + [-0.5948, 0.1875, -0.1523, -1.1995, -1.4061, -0.6367, -1.4607, -0.6406, 0.8793, -0.3891] ) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/tests/pipelines/kandinsky2_2/test_kandinsky_combined.py b/tests/pipelines/kandinsky2_2/test_kandinsky_combined.py index 40bb3b0522c9..dbba0831397b 100644 --- a/tests/pipelines/kandinsky2_2/test_kandinsky_combined.py +++ b/tests/pipelines/kandinsky2_2/test_kandinsky_combined.py @@ -99,7 +99,7 @@ def test_kandinsky(self): assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.3013, 0.0471, 0.5176, 0.1817, 0.2566, 0.7076, 0.6712, 0.4421, 0.7503]) + expected_slice = np.array([0.3076, 0.2729, 0.5668, 0.0522, 0.3384, 0.7028, 0.4908, 0.3659, 0.6243]) assert ( np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 @@ -221,7 +221,7 @@ def test_kandinsky(self): assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.4353, 0.4710, 0.5128, 0.4806, 0.5054, 0.5348, 0.5224, 0.4603, 0.5025]) + expected_slice = np.array([0.4445, 0.4287, 0.4596, 0.3919, 0.3730, 0.5039, 0.4834, 0.4269, 0.5521]) assert ( np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/tests/pipelines/kandinsky2_2/test_kandinsky_prior.py b/tests/pipelines/kandinsky2_2/test_kandinsky_prior.py index c19c574bc314..be0bc238d4da 100644 --- a/tests/pipelines/kandinsky2_2/test_kandinsky_prior.py +++ b/tests/pipelines/kandinsky2_2/test_kandinsky_prior.py @@ -213,12 +213,13 @@ def test_kandinsky_prior(self): )[0] image_slice = image[0, -10:] + image_from_tuple_slice = image_from_tuple[0, -10:] assert image.shape == (1, 32) expected_slice = np.array( - [-0.0532, 1.7120, 0.3656, -1.0852, -0.8946, -1.1756, 0.4348, 0.2482, 0.5146, -0.1156] + [-0.5948, 0.1875, -0.1523, -1.1995, -1.4061, -0.6367, -1.4607, -0.6406, 0.8793, -0.3891] ) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/tests/pipelines/kandinsky2_2/test_kandinsky_prior_emb2emb.py b/tests/pipelines/kandinsky2_2/test_kandinsky_prior_emb2emb.py index 7d66fb9bb5a1..e898824e2d17 100644 --- a/tests/pipelines/kandinsky2_2/test_kandinsky_prior_emb2emb.py +++ b/tests/pipelines/kandinsky2_2/test_kandinsky_prior_emb2emb.py @@ -30,7 +30,12 @@ ) from diffusers import KandinskyV22PriorEmb2EmbPipeline, PriorTransformer, UnCLIPScheduler -from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor, skip_mps, torch_device +from diffusers.utils.testing_utils import ( + enable_full_determinism, + floats_tensor, + skip_mps, + torch_device, +) from ..test_pipelines_common import PipelineTesterMixin @@ -210,23 +215,13 @@ def test_kandinsky_prior_emb2emb(self): )[0] image_slice = image[0, -10:] + image_from_tuple_slice = image_from_tuple[0, -10:] assert image.shape == (1, 32) expected_slice = np.array( - [ - 0.1071284, - 1.3330271, - 0.61260223, - -0.6691065, - -0.3846852, - -1.0303661, - 0.22716111, - 0.03348901, - 0.30040675, - -0.24805029, - ] + [-0.8947, 0.7225, -0.2400, -1.4224, -1.9268, -1.1454, -1.8220, -0.7972, 1.0465, -0.5207] ) assert 
np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/tests/pipelines/kolors/test_kolors.py b/tests/pipelines/kolors/test_kolors.py index ba2156e4e8ac..3f7fcaf59575 100644 --- a/tests/pipelines/kolors/test_kolors.py +++ b/tests/pipelines/kolors/test_kolors.py @@ -96,6 +96,8 @@ def get_dummy_components(self, time_cond_proj_dim=None): "vae": vae, "text_encoder": text_encoder, "tokenizer": tokenizer, + "image_encoder": None, + "feature_extractor": None, } return components @@ -132,8 +134,10 @@ def test_inference(self): max_diff = np.abs(image_slice.flatten() - expected_slice).max() self.assertLessEqual(max_diff, 1e-3) - # should skip it but pipe._optional_components = [] so it doesn't + # throws AttributeError: property 'eos_token' of 'ChatGLMTokenizer' object has no setter + # not sure if it is worth to fix it before integrating it to transformers def test_save_load_optional_components(self): + # TODO (Alvaro) need to fix later pass # throws AttributeError: property 'eos_token' of 'ChatGLMTokenizer' object has no setter diff --git a/tests/pipelines/pag/test_pag_controlnet_sd.py b/tests/pipelines/pag/test_pag_controlnet_sd.py index 1f089475040f..8a7eb6f0c675 100644 --- a/tests/pipelines/pag/test_pag_controlnet_sd.py +++ b/tests/pipelines/pag/test_pag_controlnet_sd.py @@ -173,7 +173,7 @@ def test_pag_disable_enable(self): del inputs["pag_scale"] assert ( "pag_scale" not in inspect.signature(pipe_sd.__call__).parameters - ), f"`pag_scale` should not be a call parameter of the base pipeline {pipe_sd.__calss__.__name__}." + ), f"`pag_scale` should not be a call parameter of the base pipeline {pipe_sd.__class__.__name__}." out = pipe_sd(**inputs).images[0, -3:, -3:, -1] # pag disabled with pag_scale=0.0 diff --git a/tests/pipelines/pag/test_pag_controlnet_sdxl.py b/tests/pipelines/pag/test_pag_controlnet_sdxl.py index f42afb7cea47..6400cc2b7cab 100644 --- a/tests/pipelines/pag/test_pag_controlnet_sdxl.py +++ b/tests/pipelines/pag/test_pag_controlnet_sdxl.py @@ -28,9 +28,7 @@ StableDiffusionXLControlNetPipeline, UNet2DConditionModel, ) -from diffusers.utils.testing_utils import ( - enable_full_determinism, -) +from diffusers.utils.testing_utils import enable_full_determinism from diffusers.utils.torch_utils import randn_tensor from ..pipeline_params import ( @@ -193,7 +191,7 @@ def test_pag_disable_enable(self): del inputs["pag_scale"] assert ( "pag_scale" not in inspect.signature(pipe_sd.__call__).parameters - ), f"`pag_scale` should not be a call parameter of the base pipeline {pipe_sd.__calss__.__name__}." + ), f"`pag_scale` should not be a call parameter of the base pipeline {pipe_sd.__class__.__name__}." 
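Stepping back: the `test_from_pipe_consistent_config` methods added in this patch for the AnimateDiff, SparseControlNet, video-to-video and PIA fast tests all repeat one pattern. A distilled sketch of that pattern (the helper name is ours, not part of the test suite):

    from diffusers import DiffusionPipeline


    def from_pipe_roundtrip_matches(original_pipe: DiffusionPipeline, derived_cls, **extra_components) -> bool:
        # original -> derived: hand over any components the derived pipeline needs but the original lacks
        derived = derived_cls.from_pipe(original_pipe, **extra_components)

        # derived -> original: collect components that were dropped or replaced along the way
        back_components = {
            name: component
            for name, component in original_pipe.components.items()
            if name not in derived.components or not isinstance(component, derived.components[name].__class__)
        }
        roundtripped = original_pipe.__class__.from_pipe(derived, **back_components)

        # only the public (non-underscored) config entries are expected to survive the round trip
        def public(config):
            return {k: v for k, v in config.items() if not k.startswith("_")}

        return public(roundtripped.config) == public(original_pipe.config)

In the tests, the original pipeline comes from `hf-internal-testing/tinier-stable-diffusion-pipe` with `requires_safety_checker=False`, and the extra components are whatever the dummy components provide that Stable Diffusion does not (for example the motion adapter or the sparse controlnet).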
out = pipe_sd(**inputs).images[0, -3:, -3:, -1] # pag disabled with pag_scale=0.0 @@ -237,9 +235,7 @@ def test_pag_cfg(self): 64, 3, ), f"the shape of the output image should be (1, 64, 64, 3) but got {image.shape}" - expected_slice = np.array( - [0.6819614, 0.5551478, 0.5499094, 0.5769566, 0.53942275, 0.5707505, 0.41131154, 0.47833863, 0.49982738] - ) + expected_slice = np.array([0.7036, 0.5613, 0.5526, 0.6129, 0.5610, 0.5842, 0.4228, 0.4612, 0.5017]) max_diff = np.abs(image_slice.flatten() - expected_slice).max() assert max_diff < 1e-3, f"output is different from expected, {image_slice.flatten()}" @@ -263,9 +259,7 @@ def test_pag_uncond(self): 64, 3, ), f"the shape of the output image should be (1, 64, 64, 3) but got {image.shape}" - expected_slice = np.array( - [0.66685176, 0.53207266, 0.5541569, 0.5912994, 0.5368312, 0.58433825, 0.42607725, 0.46805605, 0.5098659] - ) + expected_slice = np.array([0.6888, 0.5398, 0.5603, 0.6086, 0.5541, 0.5957, 0.4332, 0.4643, 0.5154]) max_diff = np.abs(image_slice.flatten() - expected_slice).max() assert max_diff < 1e-3, f"output is different from expected, {image_slice.flatten()}" diff --git a/tests/pipelines/pag/test_pag_sd.py b/tests/pipelines/pag/test_pag_sd.py index 201660d0e94f..a0930245b375 100644 --- a/tests/pipelines/pag/test_pag_sd.py +++ b/tests/pipelines/pag/test_pag_sd.py @@ -157,7 +157,7 @@ def test_pag_disable_enable(self): del inputs["pag_scale"] assert ( "pag_scale" not in inspect.signature(pipe_sd.__call__).parameters - ), f"`pag_scale` should not be a call parameter of the base pipeline {pipe_sd.__calss__.__name__}." + ), f"`pag_scale` should not be a call parameter of the base pipeline {pipe_sd.__class__.__name__}." out = pipe_sd(**inputs).images[0, -3:, -3:, -1] # pag disabled with pag_scale=0.0 diff --git a/tests/pipelines/pag/test_pag_sdxl.py b/tests/pipelines/pag/test_pag_sdxl.py index 9e607d42e8dc..5ec3dc5555f1 100644 --- a/tests/pipelines/pag/test_pag_sdxl.py +++ b/tests/pipelines/pag/test_pag_sdxl.py @@ -170,7 +170,7 @@ def test_pag_disable_enable(self): del inputs["pag_scale"] assert ( "pag_scale" not in inspect.signature(pipe_sd.__call__).parameters - ), f"`pag_scale` should not be a call parameter of the base pipeline {pipe_sd.__calss__.__name__}." + ), f"`pag_scale` should not be a call parameter of the base pipeline {pipe_sd.__class__.__name__}." 
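The PAG assertions above hinge on `pag_scale` being a call argument only on PAG-enabled pipelines. Roughly, the user-facing flow they guard looks like the sketch below; the checkpoint, dtype and `pag_applied_layers` choice are illustrative assumptions, not taken from these tests.

    import torch
    from diffusers import AutoPipelineForText2Image

    # Requesting PAG at load time returns the PAG variant of the pipeline,
    # which is the only variant that accepts `pag_scale` at call time.
    pipe = AutoPipelineForText2Image.from_pretrained(
        "stabilityai/stable-diffusion-xl-base-1.0",
        enable_pag=True,
        pag_applied_layers=["mid"],
        torch_dtype=torch.float16,
    ).to("cuda")

    # pag_scale=0.0 should reproduce the non-PAG result (what test_pag_disable_enable checks);
    # larger values strengthen the perturbed-attention guidance.
    image = pipe("an insect robot preparing a delicious meal", guidance_scale=5.0, pag_scale=3.0).images[0]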
out = pipe_sd(**inputs).images[0, -3:, -3:, -1] # pag disabled with pag_scale=0.0 @@ -283,9 +283,7 @@ def test_pag_inference(self): 64, 3, ), f"the shape of the output image should be (1, 64, 64, 3) but got {image.shape}" - expected_slice = np.array( - [0.55341685, 0.55503535, 0.47299808, 0.43274558, 0.4965323, 0.46310428, 0.51455414, 0.5015592, 0.46913484] - ) + expected_slice = np.array([0.5382, 0.5439, 0.4704, 0.4569, 0.5234, 0.4834, 0.5289, 0.5039, 0.4764]) max_diff = np.abs(image_slice.flatten() - expected_slice).max() self.assertLessEqual(max_diff, 1e-3) diff --git a/tests/pipelines/pag/test_pag_sdxl_img2img.py b/tests/pipelines/pag/test_pag_sdxl_img2img.py index e62c4ccf370d..7e5fc5fa28b9 100644 --- a/tests/pipelines/pag/test_pag_sdxl_img2img.py +++ b/tests/pipelines/pag/test_pag_sdxl_img2img.py @@ -260,9 +260,7 @@ def test_pag_inference(self): 32, 3, ), f"the shape of the output image should be (1, 64, 64, 3) but got {image.shape}" - expected_slice = np.array( - [0.46703637, 0.4917526, 0.44394222, 0.6895079, 0.56251144, 0.45474228, 0.5957122, 0.6016377, 0.5276273] - ) + expected_slice = np.array([0.4613, 0.4902, 0.4406, 0.6788, 0.5611, 0.4529, 0.5893, 0.5975, 0.5226]) max_diff = np.abs(image_slice.flatten() - expected_slice).max() assert max_diff < 1e-3, f"output is different from expected, {image_slice.flatten()}" diff --git a/tests/pipelines/pag/test_pag_sdxl_inpaint.py b/tests/pipelines/pag/test_pag_sdxl_inpaint.py index 9385b1fe405a..efc37abd0682 100644 --- a/tests/pipelines/pag/test_pag_sdxl_inpaint.py +++ b/tests/pipelines/pag/test_pag_sdxl_inpaint.py @@ -265,9 +265,7 @@ def test_pag_inference(self): 64, 3, ), f"the shape of the output image should be (1, 64, 64, 3) but got {image.shape}" - expected_slice = np.array( - [0.8115454, 0.53986573, 0.5825281, 0.6028964, 0.67128646, 0.7046922, 0.6418713, 0.5933924, 0.5154763] - ) + expected_slice = np.array([0.8366, 0.5513, 0.6105, 0.6213, 0.6957, 0.7400, 0.6614, 0.6102, 0.5239]) max_diff = np.abs(image_slice.flatten() - expected_slice).max() assert max_diff < 1e-3, f"output is different from expected, {image_slice.flatten()}" diff --git a/tests/pipelines/pia/test_pia.py b/tests/pipelines/pia/test_pia.py index 4150903ac0b9..83f550f30b23 100644 --- a/tests/pipelines/pia/test_pia.py +++ b/tests/pipelines/pia/test_pia.py @@ -9,8 +9,11 @@ from diffusers import ( AutoencoderKL, DDIMScheduler, + DPMSolverMultistepScheduler, + LCMScheduler, MotionAdapter, PIAPipeline, + StableDiffusionPipeline, UNet2DConditionModel, UNetMotionModel, ) @@ -54,16 +57,19 @@ class PIAPipelineFastTests(IPAdapterTesterMixin, PipelineTesterMixin, PipelineFr ) def get_dummy_components(self): + cross_attention_dim = 8 + block_out_channels = (8, 8) + torch.manual_seed(0) unet = UNet2DConditionModel( - block_out_channels=(32, 64), + block_out_channels=block_out_channels, layers_per_block=2, - sample_size=32, + sample_size=8, in_channels=4, out_channels=4, down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, + cross_attention_dim=cross_attention_dim, norm_num_groups=2, ) scheduler = DDIMScheduler( @@ -74,18 +80,19 @@ def get_dummy_components(self): ) torch.manual_seed(0) vae = AutoencoderKL( - block_out_channels=[32, 64], + block_out_channels=block_out_channels, in_channels=3, out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], latent_channels=4, + norm_num_groups=2, ) torch.manual_seed(0) 
text_encoder_config = CLIPTextConfig( bos_token_id=0, eos_token_id=2, - hidden_size=32, + hidden_size=cross_attention_dim, intermediate_size=37, layer_norm_eps=1e-05, num_attention_heads=4, @@ -95,8 +102,9 @@ def get_dummy_components(self): ) text_encoder = CLIPTextModel(text_encoder_config) tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + torch.manual_seed(0) motion_adapter = MotionAdapter( - block_out_channels=(32, 64), + block_out_channels=block_out_channels, motion_layers_per_block=2, motion_norm_num_groups=2, motion_num_attention_heads=4, @@ -121,7 +129,7 @@ def get_dummy_inputs(self, device, seed=0): else: generator = torch.Generator(device=device).manual_seed(seed) - image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) + image = floats_tensor((1, 3, 8, 8), rng=random.Random(seed)).to(device) inputs = { "image": image, "prompt": "A painting of a squirrel eating a burger", @@ -132,6 +140,36 @@ def get_dummy_inputs(self, device, seed=0): } return inputs + def test_from_pipe_consistent_config(self): + assert self.original_pipeline_class == StableDiffusionPipeline + original_repo = "hf-internal-testing/tinier-stable-diffusion-pipe" + original_kwargs = {"requires_safety_checker": False} + + # create original_pipeline_class(sd) + pipe_original = self.original_pipeline_class.from_pretrained(original_repo, **original_kwargs) + + # original_pipeline_class(sd) -> pipeline_class + pipe_components = self.get_dummy_components() + pipe_additional_components = {} + for name, component in pipe_components.items(): + if name not in pipe_original.components: + pipe_additional_components[name] = component + + pipe = self.pipeline_class.from_pipe(pipe_original, **pipe_additional_components) + + # pipeline_class -> original_pipeline_class(sd) + original_pipe_additional_components = {} + for name, component in pipe_original.components.items(): + if name not in pipe.components or not isinstance(component, pipe.components[name].__class__): + original_pipe_additional_components[name] = component + + pipe_original_2 = self.original_pipeline_class.from_pipe(pipe, **original_pipe_additional_components) + + # compare the config + original_config = {k: v for k, v in pipe_original.config.items() if not k.startswith("_")} + original_config_2 = {k: v for k, v in pipe_original_2.config.items() if not k.startswith("_")} + assert original_config_2 == original_config + def test_motion_unet_loading(self): components = self.get_dummy_components() pipe = self.pipeline_class(**components) @@ -144,33 +182,33 @@ def test_ip_adapter_single(self): if torch_device == "cpu": expected_pipe_slice = np.array( [ - 0.5609, - 0.5756, - 0.4830, - 0.4420, - 0.4547, - 0.5129, - 0.3779, - 0.4042, - 0.3772, - 0.4450, - 0.5710, - 0.5536, - 0.4835, - 0.4308, - 0.5578, - 0.5578, - 0.4395, + 0.5475, + 0.5769, + 0.4873, + 0.5064, + 0.4445, + 0.5876, + 0.5453, + 0.4102, + 0.5247, + 0.5370, + 0.3406, + 0.4322, + 0.3991, + 0.3756, + 0.5438, + 0.4780, + 0.5087, + 0.5248, + 0.6243, + 0.5506, + 0.3491, 0.5440, - 0.6051, - 0.4651, - 0.6258, - 0.5662, - 0.3988, - 0.5108, - 0.4153, - 0.3993, - 0.4803, + 0.6111, + 0.5122, + 0.5326, + 0.5180, + 0.5538, ] ) return super().test_ip_adapter_single(expected_pipe_slice=expected_pipe_slice) @@ -178,7 +216,7 @@ def test_ip_adapter_single(self): def test_dict_tuple_outputs_equivalent(self): expected_slice = None if torch_device == "cpu": - expected_slice = np.array([0.3740, 0.4284, 0.4038, 0.5417, 0.4405, 0.5521, 0.4273, 0.4124, 0.4997]) + expected_slice = 
np.array([0.5476, 0.4092, 0.5289, 0.4755, 0.5092, 0.5186, 0.5403, 0.5287, 0.5467]) return super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice) @unittest.skip("Attention slicing is not enabled in this pipeline") @@ -286,7 +324,7 @@ def test_prompt_embeds(self): inputs = self.get_dummy_inputs(torch_device) inputs.pop("prompt") - inputs["prompt_embeds"] = torch.randn((1, 4, 32), device=torch_device) + inputs["prompt_embeds"] = torch.randn((1, 4, pipe.text_encoder.config.hidden_size), device=torch_device) pipe(**inputs) def test_free_init(self): @@ -324,6 +362,52 @@ def test_free_init(self): "Disabling of FreeInit should lead to results similar to the default pipeline results", ) + def test_free_init_with_schedulers(self): + components = self.get_dummy_components() + pipe: PIAPipeline = self.pipeline_class(**components) + pipe.set_progress_bar_config(disable=None) + pipe.to(torch_device) + + inputs_normal = self.get_dummy_inputs(torch_device) + frames_normal = pipe(**inputs_normal).frames[0] + + schedulers_to_test = [ + DPMSolverMultistepScheduler.from_config( + components["scheduler"].config, + timestep_spacing="linspace", + beta_schedule="linear", + algorithm_type="dpmsolver++", + steps_offset=1, + clip_sample=False, + ), + LCMScheduler.from_config( + components["scheduler"].config, + timestep_spacing="linspace", + beta_schedule="linear", + steps_offset=1, + clip_sample=False, + ), + ] + components.pop("scheduler") + + for scheduler in schedulers_to_test: + components["scheduler"] = scheduler + pipe: PIAPipeline = self.pipeline_class(**components) + pipe.set_progress_bar_config(disable=None) + pipe.to(torch_device) + + pipe.enable_free_init(num_iters=2, use_fast_sampling=False) + + inputs = self.get_dummy_inputs(torch_device) + frames_enable_free_init = pipe(**inputs).frames[0] + sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_init)).sum() + + self.assertGreater( + sum_enabled, + 1e1, + "Enabling of FreeInit should lead to results different from the default pipeline results", + ) + @unittest.skipIf( torch_device != "cuda" or not is_xformers_available(), reason="XFormers attention is only available with CUDA and `xformers` installed", diff --git a/tests/pipelines/shap_e/test_shap_e.py b/tests/pipelines/shap_e/test_shap_e.py index 39d1ead17ef1..6cf643fe47a6 100644 --- a/tests/pipelines/shap_e/test_shap_e.py +++ b/tests/pipelines/shap_e/test_shap_e.py @@ -181,7 +181,7 @@ def test_shap_e(self): assert image.shape == (32, 16) - expected_slice = np.array([-1.0000, -0.6241, 1.0000, -0.8978, -0.6866, 0.7876, -0.7473, -0.2874, 0.6103]) + expected_slice = np.array([-1.0000, -0.6559, 1.0000, -0.9096, -0.7252, 0.8211, -0.7647, -0.3308, 0.6462]) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 def test_inference_batch_consistent(self): diff --git a/tests/pipelines/stable_cascade/test_stable_cascade_prior.py b/tests/pipelines/stable_cascade/test_stable_cascade_prior.py index c9ae18892156..0208224a1d80 100644 --- a/tests/pipelines/stable_cascade/test_stable_cascade_prior.py +++ b/tests/pipelines/stable_cascade/test_stable_cascade_prior.py @@ -168,22 +168,12 @@ def test_wuerstchen_prior(self): image_from_tuple = pipe(**self.get_dummy_inputs(device), return_dict=False)[0] image_slice = image[0, 0, 0, -10:] + image_from_tuple_slice = image_from_tuple[0, 0, 0, -10:] assert image.shape == (1, 16, 24, 24) expected_slice = np.array( - [ - 96.139565, - -20.213179, - -116.40341, - -191.57129, - 39.350136, - 74.80767, - 39.782352, - -184.67352, - 
-46.426907, - 168.41783, - ] + [94.5498, -21.9481, -117.5025, -192.8760, 38.0117, 73.4709, 38.1142, -185.5593, -47.7869, 167.2853] ) assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-2 diff --git a/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3.py b/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3.py index 93e740145477..75a7d88ea4f2 100644 --- a/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3.py +++ b/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3.py @@ -13,7 +13,11 @@ torch_device, ) -from ..test_pipelines_common import PipelineTesterMixin +from ..test_pipelines_common import ( + PipelineTesterMixin, + check_qkv_fusion_matches_attn_procs_length, + check_qkv_fusion_processors_exist, +) class StableDiffusion3PipelineFastTests(unittest.TestCase, PipelineTesterMixin): @@ -191,7 +195,16 @@ def test_fused_qkv_projections(self): image = pipe(**inputs).images original_image_slice = image[0, -3:, -3:, -1] + # TODO (sayakpaul): will refactor this once `fuse_qkv_projections()` has been added + # to the pipeline level. pipe.transformer.fuse_qkv_projections() + assert check_qkv_fusion_processors_exist( + pipe.transformer + ), "Something wrong with the fused attention processors. Expected all the attention processors to be fused." + assert check_qkv_fusion_matches_attn_procs_length( + pipe.transformer, pipe.transformer.original_attn_processors + ), "Something wrong with the attention processors concerning the fused QKV projections." + inputs = self.get_dummy_inputs(device) image = pipe(**inputs).images image_slice_fused = image[0, -3:, -3:, -1] diff --git a/tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py b/tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py index f707c15cdbb4..7a3b0f70ccb1 100644 --- a/tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py +++ b/tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py @@ -133,7 +133,7 @@ def test_stable_diffusion_img_variation_default_case(self): image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.5239, 0.5723, 0.4796, 0.5049, 0.5550, 0.4685, 0.5329, 0.4891, 0.4921]) + expected_slice = np.array([0.5348, 0.5924, 0.4798, 0.5237, 0.5741, 0.4651, 0.5344, 0.4942, 0.4851]) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 @@ -153,7 +153,7 @@ def test_stable_diffusion_img_variation_multiple_images(self): image_slice = image[-1, -3:, -3:, -1] assert image.shape == (2, 64, 64, 3) - expected_slice = np.array([0.6892, 0.5637, 0.5836, 0.5771, 0.6254, 0.6409, 0.5580, 0.5569, 0.5289]) + expected_slice = np.array([0.6647, 0.5557, 0.5723, 0.5567, 0.5869, 0.6044, 0.5502, 0.5439, 0.5189]) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 @@ -205,7 +205,7 @@ def test_stable_diffusion_img_variation_pipeline_default(self): image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.8449, 0.9079, 0.7571, 0.7873, 0.8348, 0.7010, 0.6694, 0.6873, 0.6138]) + expected_slice = np.array([0.5348, 0.5924, 0.4798, 0.5237, 0.5741, 0.4651, 0.5344, 0.4942, 0.4851]) max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice) assert max_diff < 1e-4 @@ -221,7 +221,7 @@ def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None: latents = latents.detach().cpu().numpy() assert 
latents.shape == (1, 4, 64, 64) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([-0.7974, -0.4343, -1.087, 0.04785, -1.327, 0.855, -2.148, -0.1725, 1.439]) + expected_slice = np.array([0.5348, 0.5924, 0.4798, 0.5237, 0.5741, 0.4651, 0.5344, 0.4942, 0.4851]) max_diff = numpy_cosine_similarity_distance(latents_slice.flatten(), expected_slice) assert max_diff < 1e-3 @@ -230,7 +230,7 @@ def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None: latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 64, 64) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([0.3232, 0.004883, 0.913, -1.084, 0.6143, -1.6875, -2.463, -0.439, -0.419]) + expected_slice = np.array([0.5348, 0.5924, 0.4798, 0.5237, 0.5741, 0.4651, 0.5344, 0.4942, 0.4851]) max_diff = numpy_cosine_similarity_distance(latents_slice.flatten(), expected_slice) assert max_diff < 1e-3 diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py index bf2e5ff4607a..08cf6c1dc35f 100644 --- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py +++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py @@ -174,7 +174,7 @@ def test_stable_diffusion_xl_euler(self): image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.5552, 0.5569, 0.4725, 0.4348, 0.4994, 0.4632, 0.5142, 0.5012, 0.47]) + expected_slice = np.array([0.5388, 0.5452, 0.4694, 0.4583, 0.5253, 0.4832, 0.5288, 0.5035, 0.47]) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 @@ -333,7 +333,8 @@ def test_stable_diffusion_xl_negative_prompt_embeds(self): def test_ip_adapter_single(self): expected_pipe_slice = None if torch_device == "cpu": - expected_pipe_slice = np.array([0.5552, 0.5569, 0.4725, 0.4348, 0.4994, 0.4632, 0.5142, 0.5012, 0.4700]) + expected_pipe_slice = np.array([0.5388, 0.5452, 0.4694, 0.4583, 0.5253, 0.4832, 0.5288, 0.5035, 0.4766]) + return super().test_ip_adapter_single(expected_pipe_slice=expected_pipe_slice) def test_attention_slicing_forward_pass(self): diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py index 7093ed46d07d..2bc8143fee6a 100644 --- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py +++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py @@ -295,8 +295,9 @@ def test_ip_adapter_single(self, from_multi=False, expected_pipe_slice=None): expected_pipe_slice = None if torch_device == "cpu": expected_pipe_slice = np.array( - [0.5753, 0.6022, 0.4728, 0.4986, 0.5708, 0.4645, 0.5194, 0.5134, 0.4730] + [0.5752, 0.6155, 0.4826, 0.5111, 0.5741, 0.4678, 0.5199, 0.5231, 0.4794] ) + return super().test_ip_adapter_single(expected_pipe_slice=expected_pipe_slice) def test_stable_diffusion_adapter_default_case(self): @@ -311,9 +312,7 @@ def test_stable_diffusion_adapter_default_case(self): image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [0.5752919, 0.6022097, 0.4728038, 0.49861962, 0.57084894, 0.4644975, 0.5193715, 0.5133664, 0.4729858] - ) + expected_slice = np.array([0.5752, 0.6155, 0.4826, 0.5111, 0.5741, 0.4678, 0.5199, 0.5231, 0.4794]) assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-3 @parameterized.expand( @@ -446,15 +445,14 @@ def test_stable_diffusion_adapter_default_case(self): image_slice = image[0, 
-3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [0.5813032, 0.60995954, 0.47563356, 0.5056669, 0.57199144, 0.4631841, 0.5176794, 0.51252556, 0.47183886] - ) + expected_slice = np.array([0.5617, 0.6081, 0.4807, 0.5071, 0.5665, 0.4614, 0.5165, 0.5164, 0.4786]) assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-3 def test_ip_adapter_single(self): expected_pipe_slice = None if torch_device == "cpu": - expected_pipe_slice = np.array([0.5813, 0.6100, 0.4756, 0.5057, 0.5720, 0.4632, 0.5177, 0.5125, 0.4718]) + expected_pipe_slice = np.array([0.5617, 0.6081, 0.4807, 0.5071, 0.5665, 0.4614, 0.5165, 0.5164, 0.4786]) + return super().test_ip_adapter_single(from_multi=True, expected_pipe_slice=expected_pipe_slice) def test_inference_batch_consistent( diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py index 5b86dd08963e..b160eb41b7d7 100644 --- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py +++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py @@ -313,7 +313,8 @@ def test_stable_diffusion_xl_img2img_negative_prompt_embeds(self): def test_ip_adapter_single(self): expected_pipe_slice = None if torch_device == "cpu": - expected_pipe_slice = np.array([0.5174, 0.4512, 0.5006, 0.6273, 0.5160, 0.6825, 0.6655, 0.5840, 0.5675]) + expected_pipe_slice = np.array([0.5133, 0.4626, 0.4970, 0.6273, 0.5160, 0.6891, 0.6639, 0.5892, 0.5709]) + return super().test_ip_adapter_single(expected_pipe_slice=expected_pipe_slice) def test_stable_diffusion_xl_img2img_tiny_autoencoder(self): diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py index d096096e0bd2..089e478836a5 100644 --- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py +++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py @@ -226,7 +226,8 @@ def get_dummy_inputs_2images(self, device, seed=0, img_res=64): def test_ip_adapter_single(self): expected_pipe_slice = None if torch_device == "cpu": - expected_pipe_slice = np.array([0.7971, 0.5371, 0.5973, 0.5642, 0.6689, 0.6894, 0.5770, 0.6063, 0.5261]) + expected_pipe_slice = np.array([0.8274, 0.5538, 0.6141, 0.5843, 0.6865, 0.7082, 0.5861, 0.6123, 0.5344]) + return super().test_ip_adapter_single(expected_pipe_slice=expected_pipe_slice) def test_components_function(self): @@ -250,7 +251,7 @@ def test_stable_diffusion_xl_inpaint_euler(self): assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.8029, 0.5523, 0.5825, 0.6003, 0.6702, 0.7018, 0.6369, 0.5955, 0.5123]) + expected_slice = np.array([0.8279, 0.5673, 0.6088, 0.6156, 0.6923, 0.7347, 0.6547, 0.6108, 0.5198]) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 @@ -385,7 +386,7 @@ def test_stable_diffusion_xl_refiner(self): assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.7045, 0.4838, 0.5454, 0.6270, 0.6168, 0.6717, 0.6484, 0.5681, 0.4922]) + expected_slice = np.array([0.7540, 0.5231, 0.5833, 0.6217, 0.6339, 0.7067, 0.6507, 0.5672, 0.5030]) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py b/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py index 7ccdc515c68b..a5cbf7761501 100644 --- a/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py +++ 
b/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py @@ -182,7 +182,7 @@ def test_image_embeds_none(self): image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.3872, 0.7224, 0.5601, 0.4741, 0.6872, 0.5814, 0.4636, 0.3867, 0.5078]) + expected_slice = np.array([0.4397, 0.7080, 0.5590, 0.4255, 0.7181, 0.5938, 0.4051, 0.3720, 0.5116]) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py index 15be3c8c519d..1d37ae1dc2ca 100644 --- a/tests/pipelines/test_pipelines.py +++ b/tests/pipelines/test_pipelines.py @@ -146,6 +146,7 @@ def __init__(self, encoder: CustomEncoder, scheduler: DDIMScheduler): class DownloadTests(unittest.TestCase): + @unittest.skip("Flaky behaviour on CI. Re-enable after migrating to new runners") def test_one_request_upon_cached(self): # TODO: For some reason this test fails on MPS where no HEAD call is made. if torch_device == "mps": @@ -191,6 +192,7 @@ def test_less_downloads_passed_object(self): assert "scheduler" in os.listdir(cached_folder) assert "feature_extractor" in os.listdir(cached_folder) + @unittest.skip("Flaky behaviour on CI. Re-enable after migrating to new runners") def test_less_downloads_passed_object_calls(self): # TODO: For some reason this test fails on MPS where no HEAD call is made. if torch_device == "mps": diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index 8f2419db92e3..06fcc1c90b71 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -13,6 +13,7 @@ import numpy as np import PIL.Image import torch +import torch.nn as nn from huggingface_hub import ModelCard, delete_repo from huggingface_hub.utils import is_jinja_available from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer @@ -40,7 +41,12 @@ from diffusers.schedulers import KarrasDiffusionSchedulers from diffusers.utils import logging from diffusers.utils.import_utils import is_accelerate_available, is_accelerate_version, is_xformers_available -from diffusers.utils.testing_utils import CaptureLogger, require_torch, skip_mps, torch_device +from diffusers.utils.testing_utils import ( + CaptureLogger, + require_torch, + skip_mps, + torch_device, +) from ..models.autoencoders.test_models_vae import ( get_asym_autoencoder_kl_config, @@ -67,6 +73,17 @@ def check_same_shape(tensor_list): return all(shape == shapes[0] for shape in shapes[1:]) +def check_qkv_fusion_matches_attn_procs_length(model, original_attn_processors): + current_attn_processors = model.attn_processors + return len(current_attn_processors) == len(original_attn_processors) + + +def check_qkv_fusion_processors_exist(model): + current_attn_processors = model.attn_processors + proc_names = [v.__class__.__name__ for _, v in current_attn_processors.items()] + return all(p.startswith("Fused") for p in proc_names) + + class SDFunctionTesterMixin: """ This mixin is designed to be used with PipelineTesterMixin and unittest.TestCase classes. 
@@ -196,6 +213,19 @@ def test_fused_qkv_projections(self): original_image_slice = image[0, -3:, -3:, -1] pipe.fuse_qkv_projections() + for _, component in pipe.components.items(): + if ( + isinstance(component, nn.Module) + and hasattr(component, "original_attn_processors") + and component.original_attn_processors is not None + ): + assert check_qkv_fusion_processors_exist( + component + ), "Something wrong with the fused attention processors. Expected all the attention processors to be fused." + assert check_qkv_fusion_matches_attn_procs_length( + component, component.original_attn_processors + ), "Something wrong with the attention processors concerning the fused QKV projections." + inputs = self.get_dummy_inputs(device) inputs["return_dict"] = False image_fused = pipe(**inputs)[0] @@ -1351,14 +1381,24 @@ def _test_attention_slicing_forward_pass( pipe.enable_attention_slicing(slice_size=1) inputs = self.get_dummy_inputs(generator_device) - output_with_slicing = pipe(**inputs)[0] + output_with_slicing1 = pipe(**inputs)[0] + + pipe.enable_attention_slicing(slice_size=2) + inputs = self.get_dummy_inputs(generator_device) + output_with_slicing2 = pipe(**inputs)[0] if test_max_difference: - max_diff = np.abs(to_np(output_with_slicing) - to_np(output_without_slicing)).max() - self.assertLess(max_diff, expected_max_diff, "Attention slicing should not affect the inference results") + max_diff1 = np.abs(to_np(output_with_slicing1) - to_np(output_without_slicing)).max() + max_diff2 = np.abs(to_np(output_with_slicing2) - to_np(output_without_slicing)).max() + self.assertLess( + max(max_diff1, max_diff2), + expected_max_diff, + "Attention slicing should not affect the inference results", + ) if test_mean_pixel_difference: - assert_mean_pixel_difference(to_np(output_with_slicing[0]), to_np(output_without_slicing[0])) + assert_mean_pixel_difference(to_np(output_with_slicing1[0]), to_np(output_without_slicing[0])) + assert_mean_pixel_difference(to_np(output_with_slicing2[0]), to_np(output_without_slicing[0])) @unittest.skipIf( torch_device != "cuda" or not is_accelerate_available() or is_accelerate_version("<", "0.14.0"), diff --git a/tests/pipelines/text_to_video_synthesis/test_text_to_video_zero_sdxl.py b/tests/pipelines/text_to_video_synthesis/test_text_to_video_zero_sdxl.py index bcde20a36c34..8ba85455d3ab 100644 --- a/tests/pipelines/text_to_video_synthesis/test_text_to_video_zero_sdxl.py +++ b/tests/pipelines/text_to_video_synthesis/test_text_to_video_zero_sdxl.py @@ -168,8 +168,12 @@ def test_text_to_video_zero_sdxl(self): first_frame_slice = result[0, -3:, -3:, -1] last_frame_slice = result[-1, -3:, -3:, 0] - expected_slice1 = np.array([0.48, 0.58, 0.53, 0.59, 0.50, 0.44, 0.60, 0.65, 0.52]) - expected_slice2 = np.array([0.66, 0.49, 0.40, 0.70, 0.47, 0.51, 0.73, 0.65, 0.52]) + expected_slice1 = np.array( + [0.6008109, 0.73051643, 0.51778656, 0.55817354, 0.45222935, 0.45998418, 0.57017255, 0.54874814, 0.47078788] + ) + expected_slice2 = np.array( + [0.6011751, 0.47420046, 0.41660714, 0.6472957, 0.41261768, 0.5438129, 0.7401535, 0.6756011, 0.53652245] + ) assert np.abs(first_frame_slice.flatten() - expected_slice1).max() < 1e-2 assert np.abs(last_frame_slice.flatten() - expected_slice2).max() < 1e-2 diff --git a/tests/schedulers/test_scheduler_dpm_single.py b/tests/schedulers/test_scheduler_dpm_single.py index ea43c210d650..873eaecd0a5c 100644 --- a/tests/schedulers/test_scheduler_dpm_single.py +++ b/tests/schedulers/test_scheduler_dpm_single.py @@ -194,16 +194,20 @@ def 
test_prediction_type(self): self.check_over_configs(prediction_type=prediction_type) def test_solver_order_and_type(self): - for algorithm_type in ["dpmsolver", "dpmsolver++"]: + for algorithm_type in ["dpmsolver", "dpmsolver++", "sde-dpmsolver++"]: for solver_type in ["midpoint", "heun"]: for order in [1, 2, 3]: for prediction_type in ["epsilon", "sample"]: - self.check_over_configs( - solver_order=order, - solver_type=solver_type, - prediction_type=prediction_type, - algorithm_type=algorithm_type, - ) + if algorithm_type == "sde-dpmsolver++": + if order == 3: + continue + else: + self.check_over_configs( + solver_order=order, + solver_type=solver_type, + prediction_type=prediction_type, + algorithm_type=algorithm_type, + ) sample = self.full_loop( solver_order=order, solver_type=solver_type, diff --git a/utils/fetch_torch_cuda_pipeline_test_matrix.py b/utils/fetch_torch_cuda_pipeline_test_matrix.py index 235d346d9306..e6a9c4b6a3bd 100644 --- a/utils/fetch_torch_cuda_pipeline_test_matrix.py +++ b/utils/fetch_torch_cuda_pipeline_test_matrix.py @@ -4,7 +4,7 @@ from collections import defaultdict from pathlib import Path -from huggingface_hub import HfApi, ModelFilter +from huggingface_hub import HfApi import diffusers @@ -27,7 +27,6 @@ logger = logging.getLogger(__name__) api = HfApi() -filter = ModelFilter(library="diffusers") def filter_pipelines(usage_dict, usage_cutoff=10000): @@ -46,7 +45,7 @@ def filter_pipelines(usage_dict, usage_cutoff=10000): def fetch_pipeline_objects(): - models = api.list_models(filter=filter) + models = api.list_models(library="diffusers") downloads = defaultdict(int) for model in models: diff --git a/scripts/log_reports.py b/utils/log_reports.py similarity index 100% rename from scripts/log_reports.py rename to utils/log_reports.py diff --git a/utils/overwrite_expected_slice.py b/utils/overwrite_expected_slice.py index 57177a432704..07778a05b1ee 100644 --- a/utils/overwrite_expected_slice.py +++ b/utils/overwrite_expected_slice.py @@ -76,7 +76,7 @@ def main(correct, fail=None): done_tests = defaultdict(int) for line in correct_lines: - file, class_name, test_name, correct_line = line.split(";") + file, class_name, test_name, correct_line = line.split("::") if test_failures is None or "::".join([file, class_name, test_name]) in test_failures: overwrite_file(file, class_name, test_name, correct_line, done_tests)
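
Note on the utils/fetch_torch_cuda_pipeline_test_matrix.py hunk above: dropping ModelFilter tracks its removal from newer huggingface_hub releases, where the library filter is passed directly to list_models. A minimal sketch of the new-style query, assuming a recent huggingface_hub; the download tally below is illustrative only and is not the script's actual aggregation logic.

from collections import defaultdict

from huggingface_hub import HfApi

api = HfApi()

# The library filter is now a keyword argument of list_models; no ModelFilter object is needed.
models = api.list_models(library="diffusers")

# Illustrative aggregation: tally downloads per model id (downloads can be None
# for some entries, hence the fallback to 0).
downloads = defaultdict(int)
for model in models:
    downloads[model.id] += model.downloads or 0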