Merge pull request #174 from huggingface/main

Merge changes
Skquark · Aug 14, 2024 · 4ddce2a · 4ddce2a
2 parents ae5da35 + 0c1e63b
commit 4ddce2a
Show file tree

Hide file tree

Showing 79 changed files with 9,138 additions and 343 deletions.
diff --git a/README.md b/README.md
@@ -204,6 +204,7 @@ Also, say 👋 in our public Discord channel <a href="https://discord.gg/G7tWnz9
 
 - https://github.com/microsoft/TaskMatrix
 - https://github.com/invoke-ai/InvokeAI
+- https://github.com/InstantID/InstantID
 - https://github.com/apple/ml-stable-diffusion
 - https://github.com/Sanster/lama-cleaner
 - https://github.com/IDEA-Research/Grounded-Segment-Anything

diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
@@ -223,68 +223,76 @@
     sections:
     - local: api/models/overview
       title: Overview
-    - local: api/models/unet
-      title: UNet1DModel
-    - local: api/models/unet2d
-      title: UNet2DModel
-    - local: api/models/unet2d-cond
-      title: UNet2DConditionModel
-    - local: api/models/unet3d-cond
-      title: UNet3DConditionModel
-    - local: api/models/unet-motion
-      title: UNetMotionModel
-    - local: api/models/uvit2d
-      title: UViT2DModel
-    - local: api/models/vq
-      title: VQModel
-    - local: api/models/autoencoderkl
-      title: AutoencoderKL
-    - local: api/models/autoencoderkl_cogvideox
-      title: AutoencoderKLCogVideoX
-    - local: api/models/asymmetricautoencoderkl
-      title: AsymmetricAutoencoderKL
-    - local: api/models/stable_cascade_unet
-      title: StableCascadeUNet
-    - local: api/models/autoencoder_tiny
-      title: Tiny AutoEncoder
-    - local: api/models/autoencoder_oobleck
-      title: Oobleck AutoEncoder
-    - local: api/models/consistency_decoder_vae
-      title: ConsistencyDecoderVAE
-    - local: api/models/transformer2d
-      title: Transformer2DModel
-    - local: api/models/pixart_transformer2d
-      title: PixArtTransformer2DModel
-    - local: api/models/dit_transformer2d
-      title: DiTTransformer2DModel
-    - local: api/models/hunyuan_transformer2d
-      title: HunyuanDiT2DModel
-    - local: api/models/aura_flow_transformer2d
-      title: AuraFlowTransformer2DModel
-    - local: api/models/flux_transformer
-      title: FluxTransformer2DModel
-    - local: api/models/latte_transformer3d
-      title: LatteTransformer3DModel
-    - local: api/models/cogvideox_transformer3d
-      title: CogVideoXTransformer3DModel
-    - local: api/models/lumina_nextdit2d
-      title: LuminaNextDiT2DModel
-    - local: api/models/transformer_temporal
-      title: TransformerTemporalModel
-    - local: api/models/sd3_transformer2d
-      title: SD3Transformer2DModel
-    - local: api/models/stable_audio_transformer
-      title: StableAudioDiTModel
-    - local: api/models/prior_transformer
-      title: PriorTransformer
-    - local: api/models/controlnet
-      title: ControlNetModel
-    - local: api/models/controlnet_hunyuandit
-      title: HunyuanDiT2DControlNetModel
-    - local: api/models/controlnet_sd3
-      title: SD3ControlNetModel
-    - local: api/models/controlnet_sparsectrl
-      title: SparseControlNetModel
+    - sections:
+      - local: api/models/controlnet
+        title: ControlNetModel
+      - local: api/models/controlnet_hunyuandit
+        title: HunyuanDiT2DControlNetModel
+      - local: api/models/controlnet_sd3
+        title: SD3ControlNetModel
+      - local: api/models/controlnet_sparsectrl
+        title: SparseControlNetModel
+      title: ControlNets
+    - sections:
+      - local: api/models/aura_flow_transformer2d
+        title: AuraFlowTransformer2DModel
+      - local: api/models/cogvideox_transformer3d
+        title: CogVideoXTransformer3DModel
+      - local: api/models/dit_transformer2d
+        title: DiTTransformer2DModel
+      - local: api/models/flux_transformer
+        title: FluxTransformer2DModel
+      - local: api/models/hunyuan_transformer2d
+        title: HunyuanDiT2DModel
+      - local: api/models/latte_transformer3d
+        title: LatteTransformer3DModel
+      - local: api/models/lumina_nextdit2d
+        title: LuminaNextDiT2DModel
+      - local: api/models/pixart_transformer2d
+        title: PixArtTransformer2DModel
+      - local: api/models/prior_transformer
+        title: PriorTransformer
+      - local: api/models/sd3_transformer2d
+        title: SD3Transformer2DModel
+      - local: api/models/stable_audio_transformer
+        title: StableAudioDiTModel
+      - local: api/models/transformer2d
+        title: Transformer2DModel
+      - local: api/models/transformer_temporal
+        title: TransformerTemporalModel
+      title: Transformers
+    - sections:
+      - local: api/models/stable_cascade_unet
+        title: StableCascadeUNet
+      - local: api/models/unet
+        title: UNet1DModel
+      - local: api/models/unet2d
+        title: UNet2DModel
+      - local: api/models/unet2d-cond
+        title: UNet2DConditionModel
+      - local: api/models/unet3d-cond
+        title: UNet3DConditionModel
+      - local: api/models/unet-motion
+        title: UNetMotionModel
+      - local: api/models/uvit2d
+        title: UViT2DModel
+      title: UNets
+    - sections:
+      - local: api/models/autoencoderkl
+        title: AutoencoderKL
+      - local: api/models/autoencoderkl_cogvideox
+        title: AutoencoderKLCogVideoX
+      - local: api/models/asymmetricautoencoderkl
+        title: AsymmetricAutoencoderKL
+      - local: api/models/consistency_decoder_vae
+        title: ConsistencyDecoderVAE
+      - local: api/models/autoencoder_oobleck
+        title: Oobleck AutoEncoder
+      - local: api/models/autoencoder_tiny
+        title: Tiny AutoEncoder
+      - local: api/models/vq
+        title: VQModel
+      title: VAEs
     title: Models
   - isExpanded: false
     sections:

diff --git a/docs/source/en/api/pipelines/cogvideox.md b/docs/source/en/api/pipelines/cogvideox.md
@@ -15,9 +15,7 @@
 
 # CogVideoX
 
-<!-- TODO: update paper with ArXiv link when ready. -->
-
-[CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer](https://github.com/THUDM/CogVideo/blob/main/resources/CogVideoX.pdf) from Tsinghua University & ZhipuAI.
+[CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer](https://arxiv.org/abs/2408.06072) from Tsinghua University & ZhipuAI, by Zhuoyi Yang, Jiayan Teng, Wendi Zheng, Ming Ding, Shiyu Huang, Jiazheng Xu, Yuanming Yang, Wenyi Hong, Xiaohan Zhang, Guanyu Feng, Da Yin, Xiaotao Gu, Yuxuan Zhang, Weihan Wang, Yean Cheng, Ting Liu, Bin Xu, Yuxiao Dong, Jie Tang.
 
 The abstract from the paper is:
 
@@ -43,43 +41,42 @@ from diffusers import CogVideoXPipeline
 from diffusers.utils import export_to_video
 
 pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b").to("cuda")
-prompt = (
-    "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. "
-    "The panda's fluffy paws strum a miniature acoustic guitar, producing soft, melodic tunes. Nearby, a few other "
-    "pandas gather, watching curiously and some clapping in rhythm. Sunlight filters through the tall bamboo, "
-    "casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. "
-    "The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical "
-    "atmosphere of this unique musical performance."
-)
-video = pipe(prompt=prompt, guidance_scale=6, num_inference_steps=50).frames[0]
-export_to_video(video, "output.mp4", fps=8)
 ```
 
-Then change the memory layout of the pipelines `transformer` and `vae` components to `torch.channels-last`:
+Then change the memory layout of the pipelines `transformer` component to `torch.channels_last`:
 
 ```python
-pipeline.transformer.to(memory_format=torch.channels_last)
-pipeline.vae.to(memory_format=torch.channels_last)
+pipe.transformer.to(memory_format=torch.channels_last)
 ```
 
 Finally, compile the components and run inference:
 
 ```python
-pipeline.transformer = torch.compile(pipeline.transformer)
-pipeline.vae.decode = torch.compile(pipeline.vae.decode)
+pipe.transformer = torch.compile(pipeline.transformer, mode="max-autotune", fullgraph=True)
 
-# CogVideoX works very well with long and well-described prompts
+# CogVideoX works well with long and well-described prompts
 prompt = "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. The panda's fluffy paws strum a miniature acoustic guitar, producing soft, melodic tunes. Nearby, a few other pandas gather, watching curiously and some clapping in rhythm. Sunlight filters through the tall bamboo, casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical atmosphere of this unique musical performance."
-video = pipeline(prompt=prompt, guidance_scale=6, num_inference_steps=50).frames[0]
+video = pipe(prompt=prompt, guidance_scale=6, num_inference_steps=50).frames[0]
 ```
 
-The [benchmark](TODO: link) results on an 80GB A100 machine are:
+The [benchmark](https://gist.github.com/a-r-r-o-w/5183d75e452a368fd17448fcc810bd3f) results on an 80GB A100 machine are:
 
 ```
-Without torch.compile(): Average inference time: TODO seconds.
-With torch.compile(): Average inference time: TODO seconds.
+Without torch.compile(): Average inference time: 96.89 seconds.
+With torch.compile(): Average inference time: 76.27 seconds.
 ```
 
+### Memory optimization
+
+CogVideoX requires about 19 GB of GPU memory to decode 49 frames (6 seconds of video at 8 FPS) with output resolution 720x480 (W x H), which makes it not possible to run on consumer GPUs or free-tier T4 Colab. The following memory optimizations could be used to reduce the memory footprint. For replication, you can refer to [this](https://gist.github.com/a-r-r-o-w/3959a03f15be5c9bd1fe545b09dfcc93) script.
+
+- `pipe.enable_model_cpu_offload()`:
+  - Without enabling cpu offloading, memory usage is `33 GB`
+  - With enabling cpu offloading, memory usage is `19 GB`
+- `pipe.vae.enable_tiling()`:
+  - With enabling cpu offloading and tiling, memory usage is `11 GB`
+- `pipe.vae.enable_slicing()`
+
 ## CogVideoXPipeline
 
 [[autodoc]] CogVideoXPipeline

diff --git a/docs/source/en/api/pipelines/controlnet_sd3.md b/docs/source/en/api/pipelines/controlnet_sd3.md
@@ -1,4 +1,4 @@
-<!--Copyright 2023 The HuggingFace Team and The InstantX Team. All rights reserved.
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
@@ -22,7 +22,16 @@ The abstract from the paper is:
 
 *We present ControlNet, a neural network architecture to add spatial conditioning controls to large, pretrained text-to-image diffusion models. ControlNet locks the production-ready large diffusion models, and reuses their deep and robust encoding layers pretrained with billions of images as a strong backbone to learn a diverse set of conditional controls. The neural architecture is connected with "zero convolutions" (zero-initialized convolution layers) that progressively grow the parameters from zero and ensure that no harmful noise could affect the finetuning. We test various conditioning controls, eg, edges, depth, segmentation, human pose, etc, with Stable Diffusion, using single or multiple conditions, with or without prompts. We show that the training of ControlNets is robust with small (<50k) and large (>1m) datasets. Extensive results show that ControlNet may facilitate wider applications to control image diffusion models.*
 
-This code is implemented by [The InstantX Team](https://huggingface.co/InstantX). You can find pre-trained checkpoints for SD3-ControlNet on [The InstantX Team](https://huggingface.co/InstantX) Hub profile.
+This controlnet code is mainly implemented by [The InstantX Team](https://huggingface.co/InstantX). The inpainting-related code was developed by [The Alimama Creative Team](https://huggingface.co/alimama-creative). You can find pre-trained checkpoints for SD3-ControlNet in the table below: 
+
+
+| ControlNet type | Developer | Link |
+| -------- | ---------- | ---- |
+| Canny | [The InstantX Team](https://huggingface.co/InstantX) | [Link](https://huggingface.co/InstantX/SD3-Controlnet-Canny) |
+| Pose | [The InstantX Team](https://huggingface.co/InstantX) | [Link](https://huggingface.co/InstantX/SD3-Controlnet-Pose) |
+| Tile | [The InstantX Team](https://huggingface.co/InstantX) | [Link](https://huggingface.co/InstantX/SD3-Controlnet-Tile) |
+| Inpainting | [The AlimamaCreative Team](https://huggingface.co/alimama-creative) | [link](https://huggingface.co/alimama-creative/SD3-Controlnet-Inpainting) |
+
 
 <Tip>
 
@@ -35,5 +44,10 @@ Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers)
 	- all
 	- __call__
 
+## StableDiffusion3ControlNetInpaintingPipeline
+[[autodoc]] pipelines.controlnet_sd3.pipeline_stable_diffusion_3_controlnet_inpainting.StableDiffusion3ControlNetInpaintingPipeline
+	- all
+	- __call__
+
 ## StableDiffusion3PipelineOutput
 [[autodoc]] pipelines.stable_diffusion_3.pipeline_output.StableDiffusion3PipelineOutput
diff --git a/docs/source/en/api/pipelines/stable_audio.md b/docs/source/en/api/pipelines/stable_audio.md
@@ -21,7 +21,7 @@ Stable Audio is trained on a corpus of around 48k audio recordings, where around
 The abstract of the paper is the following:
 *Open generative models are vitally important for the community, allowing for fine-tunes and serving as baselines when presenting new models. However, most current text-to-audio models are private and not accessible for artists and researchers to build upon. Here we describe the architecture and training process of a new open-weights text-to-audio model trained with Creative Commons data. Our evaluation shows that the model's performance is competitive with the state-of-the-art across various metrics. Notably, the reported FDopenl3 results (measuring the realism of the generations) showcase its potential for high-quality stereo sound synthesis at 44.1kHz.*
 
-This pipeline was contributed by [Yoach Lacombe](https://huggingface.co/ylacombe). The original codebase can be found at [Stability-AI/stable-audio-tool](https://github.com/Stability-AI/stable-audio-tool).
+This pipeline was contributed by [Yoach Lacombe](https://huggingface.co/ylacombe). The original codebase can be found at [Stability-AI/stable-audio-tools](https://github.com/Stability-AI/stable-audio-tools).
 
 ## Tips
 

diff --git a/docs/source/en/optimization/fp16.md b/docs/source/en/optimization/fp16.md
@@ -125,3 +125,5 @@ image
     <figcaption class="mt-2 text-center text-sm text-gray-500">distilled Stable Diffusion + Tiny AutoEncoder</figcaption>
   </div>
 </div>
+
+More tiny autoencoder models for other Stable Diffusion models, like Stable Diffusion 3, are available from [madebyollin](https://huggingface.co/madebyollin).
diff --git a/docs/source/en/training/distributed_inference.md b/docs/source/en/training/distributed_inference.md
@@ -48,7 +48,7 @@ accelerate launch run_distributed.py --num_processes=2
 
 <Tip>
 
-To learn more, take a look at the [Distributed Inference with 🤗 Accelerate](https://huggingface.co/docs/accelerate/en/usage_guides/distributed_inference#distributed-inference-with-accelerate) guide.
+Refer to this minimal example [script](https://gist.github.com/sayakpaul/cfaebd221820d7b43fae638b4dfa01ba) for running inference across multiple GPUs. To learn more, take a look at the [Distributed Inference with 🤗 Accelerate](https://huggingface.co/docs/accelerate/en/usage_guides/distributed_inference#distributed-inference-with-accelerate) guide.
 
 </Tip>
 
@@ -108,4 +108,4 @@ torchrun run_distributed.py --nproc_per_node=2
 ```
 
 > [!TIP]
-> You can use `device_map` within a [`DiffusionPipeline`] to distribute its model-level components on multiple devices. Refer to the [Device placement](../tutorials/inference_with_big_models#device-placement) guide to learn more.
+> You can use `device_map` within a [`DiffusionPipeline`] to distribute its model-level components on multiple devices. Refer to the [Device placement](../tutorials/inference_with_big_models#device-placement) guide to learn more.
-Original file line number
+Diff line change
@@ Expand Up / @@ -125,3 +125,5 @@ image @@
         <figcaption class="mt-2 text-center text-sm text-gray-500">distilled Stable Diffusion + Tiny AutoEncoder</figcaption>
       </div>
     </div>
+    More tiny autoencoder models for other Stable Diffusion models, like Stable Diffusion 3, are available from [madebyollin](https://huggingface.co/madebyollin).